Replace shadow pagetable code with shadow2.
authortdeegan@york.uk.xensource.com <tdeegan@york.uk.xensource.com>
Wed, 16 Aug 2006 16:02:35 +0000 (17:02 +0100)
committertdeegan@york.uk.xensource.com <tdeegan@york.uk.xensource.com>
Wed, 16 Aug 2006 16:02:35 +0000 (17:02 +0100)
76 files changed:
.hgtags
tools/examples/xmexample.hvm
tools/libxc/xc_domain.c
tools/libxc/xc_hvm_build.c
tools/libxc/xc_linux_build.c
tools/libxc/xc_linux_save.c
tools/libxc/xenctrl.h
tools/misc/xc_shadow.c
tools/python/xen/lowlevel/xc/xc.c
tools/python/xen/xend/XendDomain.py
tools/python/xen/xend/XendDomainInfo.py
tools/python/xen/xend/image.py
tools/python/xen/xm/create.py
xen/arch/x86/Makefile
xen/arch/x86/audit.c [deleted file]
xen/arch/x86/dom0_ops.c
xen/arch/x86/domain.c
xen/arch/x86/domain_build.c
xen/arch/x86/hvm/hvm.c
xen/arch/x86/hvm/platform.c
xen/arch/x86/hvm/svm/svm.c
xen/arch/x86/hvm/svm/vmcb.c
xen/arch/x86/hvm/vlapic.c
xen/arch/x86/hvm/vmx/vmcs.c
xen/arch/x86/hvm/vmx/vmx.c
xen/arch/x86/mm.c
xen/arch/x86/setup.c
xen/arch/x86/shadow.c [deleted file]
xen/arch/x86/shadow2-common.c [new file with mode: 0644]
xen/arch/x86/shadow2.c [new file with mode: 0644]
xen/arch/x86/shadow32.c [deleted file]
xen/arch/x86/shadow_guest32.c [deleted file]
xen/arch/x86/shadow_guest32pae.c [deleted file]
xen/arch/x86/shadow_public.c [deleted file]
xen/arch/x86/smpboot.c
xen/arch/x86/traps.c
xen/arch/x86/x86_32/domain_page.c
xen/arch/x86/x86_32/mm.c
xen/arch/x86/x86_64/mm.c
xen/arch/x86/x86_64/traps.c
xen/common/acm_ops.c
xen/common/grant_table.c
xen/common/keyhandler.c
xen/common/memory.c
xen/drivers/char/console.c
xen/include/asm-x86/bitops.h
xen/include/asm-x86/config.h
xen/include/asm-x86/domain.h
xen/include/asm-x86/grant_table.h
xen/include/asm-x86/hvm/hvm.h
xen/include/asm-x86/hvm/support.h
xen/include/asm-x86/hvm/vcpu.h
xen/include/asm-x86/hvm/vmx/vmcs.h
xen/include/asm-x86/hvm/vmx/vmx.h
xen/include/asm-x86/mm.h
xen/include/asm-x86/msr.h
xen/include/asm-x86/page-guest32.h
xen/include/asm-x86/page.h
xen/include/asm-x86/perfc_defn.h
xen/include/asm-x86/processor.h
xen/include/asm-x86/shadow.h
xen/include/asm-x86/shadow2-multi.h [new file with mode: 0644]
xen/include/asm-x86/shadow2-private.h [new file with mode: 0644]
xen/include/asm-x86/shadow2-types.h [new file with mode: 0644]
xen/include/asm-x86/shadow2.h [new file with mode: 0644]
xen/include/asm-x86/shadow_64.h [deleted file]
xen/include/asm-x86/shadow_ops.h [deleted file]
xen/include/asm-x86/shadow_public.h [deleted file]
xen/include/asm-x86/x86_32/page-2level.h
xen/include/asm-x86/x86_32/page-3level.h
xen/include/asm-x86/x86_64/page.h
xen/include/public/dom0_ops.h
xen/include/xen/domain_page.h
xen/include/xen/lib.h
xen/include/xen/list.h
xen/include/xen/sched.h

diff --git a/.hgtags b/.hgtags
index b097c216b9aa4da4a0c3269e3f6e744dafda66a5..41fa5ab702015481924c046be2c2fb7c2d8fe3b6 100644 (file)
--- a/.hgtags
+++ b/.hgtags
@@ -15,3 +15,13 @@ fb875591fd72e15c31879c0e9034d99b80225595 RELEASE-2.0.4
 c8fdb0caa77b429cf47f9707926e83947778cb48 RELEASE-3.0.0
 af0573e9e5258db0a9d28aa954dd302ddd2c2d23 3.0.2-rc
 d0d3fef37685be264a7f52201f8ef44c030daad3 3.0.2-branched
+6e864d7de9db066f92bea505d256bfe286200fed last-code-review
+a898a6510c5db4e3d1f69d40fcacb540643b0f22 mainline
+bfa6f4a0c594bc0ebd896437d69857b58dab0988 last-code-review
+fc6cbf31bd883bc76ceb97f4b817ac88078d696a latest patch to unstable
+8e55c5c1147589b7a6a1875384d4317aec7ccf84 mainline
+2d2ed4d9b1c14aeee29dfdd77acd6017d31290cd mainline
+0e32095a7b4611d18a82052a9d5b23e474f91af9 mainline
+88e6bd5e2b5439f97e1d50a8724103c619aeaadf mainline
+5233c4b076b9aa073eff63508461b7bfa597737c mainline
+fda70200da01b89d5339342df6c0db372369a16d mainline
index 396274c8600861588778218a9c55b22389cf830e..dd07a3b90eedbecc8be444c8172e3d8f80aa62f5 100644 (file)
@@ -27,6 +27,10 @@ builder='hvm'
 #          and modules. Allocating less than 32MBs is not recommended.
 memory = 128
 
+# Shadow pagetable memory for the domain, in MB.
+# Should be at least 2KB per MB of domain memory, plus a few MB per vcpu.
+shadow_memory = 8
+
 # A name for your domain. All domains must have different names.
 name = "ExampleHVMDomain"
 
index 51117117f407d985f2d9b79d556b1df48e24a93f..801e35ea0865e20ee39f84698ef98366bdf3c8f3 100644 (file)
@@ -213,21 +213,28 @@ int xc_shadow_control(int xc_handle,
                       unsigned int sop,
                       unsigned long *dirty_bitmap,
                       unsigned long pages,
-                      xc_shadow_control_stats_t *stats )
+                      unsigned long *mb,
+                      uint32_t mode,
+                      xc_shadow_control_stats_t *stats)
 {
     int rc;
     DECLARE_DOM0_OP;
     op.cmd = DOM0_SHADOW_CONTROL;
     op.u.shadow_control.domain = (domid_t)domid;
     op.u.shadow_control.op     = sop;
-    set_xen_guest_handle(op.u.shadow_control.dirty_bitmap, dirty_bitmap);
     op.u.shadow_control.pages  = pages;
+    op.u.shadow_control.mb     = mb ? *mb : 0;
+    op.u.shadow_control.mode   = mode;
+    set_xen_guest_handle(op.u.shadow_control.dirty_bitmap, dirty_bitmap);
 
     rc = do_dom0_op(xc_handle, &op);
 
     if ( stats )
         memcpy(stats, &op.u.shadow_control.stats,
                sizeof(xc_shadow_control_stats_t));
+    
+    if ( mb ) 
+        *mb = op.u.shadow_control.mb;
 
     return (rc == 0) ? op.u.shadow_control.pages : rc;
 }
@@ -391,7 +398,7 @@ int xc_domain_memory_populate_physmap(int xc_handle,
 
     if ( err > 0 )
     {
-        DPRINTF("Failed deallocation for dom %d: %ld pages order %d\n",
+        DPRINTF("Failed allocation for dom %d: %ld pages order %d\n",
                 domid, nr_extents, extent_order);
         errno = EBUSY;
         err = -1;
index d4799abc876b7448592e84cf52d42a79b454120a..173c6733ee898480285cb9a49aaed4847a5d0091 100644 (file)
@@ -396,6 +396,19 @@ static int xc_hvm_build_internal(int xc_handle,
         goto error_out;
     }
 
+    /* HVM domains must be put into shadow2 mode at the start of day */
+    if ( xc_shadow_control(xc_handle, domid, DOM0_SHADOW2_CONTROL_OP_ENABLE,
+                           NULL, 0, NULL, 
+                           DOM0_SHADOW2_CONTROL_FLAG_ENABLE 
+                           | DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT
+                           | DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE
+                           | DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL, 
+                           NULL) ) 
+    {
+        PERROR("Could not enable shadow paging for domain.\n");
+        goto error_out;
+    }        
+
     memset(ctxt, 0, sizeof(*ctxt));
 
     ctxt->flags = VGCF_HVM_GUEST;
index 9d7ea54a86c6843c8cb79ae0375d0356624635b5..116429a72928e075d85335aef4f89f21dedfe936 100644 (file)
@@ -972,7 +972,7 @@ static int setup_guest(int xc_handle,
         /* Enable shadow translate mode */
         if ( xc_shadow_control(xc_handle, dom,
                                DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE,
-                               NULL, 0, NULL) < 0 )
+                               NULL, 0, NULL, 0, NULL) < 0 )
         {
             PERROR("Could not enable translation mode");
             goto error_out;
index 8cf21dced5f612a7fc8ed6a0836e94eb2d1877f7..49d212995e49ea1ed054c0c075c4b45b5e230c43 100644 (file)
@@ -338,13 +338,13 @@ static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
         int i;
 
         xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
-                          arr, max_pfn, NULL);
+                          arr, max_pfn, NULL, 0, NULL);
         DPRINTF("#Flush\n");
         for ( i = 0; i < 40; i++ ) {
             usleep(50000);
             now = llgettimeofday();
             xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK,
-                              NULL, 0, &stats);
+                              NULL, 0, NULL, 0, &stats);
 
             DPRINTF("now= %lld faults= %" PRId32 " dirty= %" PRId32
                     " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
@@ -727,7 +727,7 @@ int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
 
         if (xc_shadow_control(xc_handle, dom,
                               DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
-                              NULL, 0, NULL ) < 0) {
+                              NULL, 0, NULL, 0, NULL) < 0) {
             ERR("Couldn't enable shadow mode");
             goto out;
         }
@@ -879,7 +879,7 @@ int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                but this is fast enough for the moment. */
             if (!last_iter && xc_shadow_control(
                     xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
-                    to_skip, max_pfn, NULL) != max_pfn) {
+                    to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
                 ERR("Error peeking shadow bitmap");
                 goto out;
             }
@@ -1084,8 +1084,9 @@ int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                         (unsigned long)ctxt.user_regs.edx);
             }
 
-            if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
-                                  to_send, max_pfn, &stats ) != max_pfn) {
+            if (xc_shadow_control(xc_handle, dom, 
+                                  DOM0_SHADOW_CONTROL_OP_CLEAN, to_send, 
+                                  max_pfn, NULL, 0, &stats) != max_pfn) {
                 ERR("Error flushing shadow PT");
                 goto out;
             }
@@ -1174,8 +1175,9 @@ int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
  out:
 
     if (live) {
-        if(xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_OFF,
-                             NULL, 0, NULL ) < 0) {
+        if(xc_shadow_control(xc_handle, dom, 
+                             DOM0_SHADOW_CONTROL_OP_OFF,
+                             NULL, 0, NULL, 0, NULL) < 0) {
             DPRINTF("Warning - couldn't disable shadow mode");
         }
     }
index 2d301b2c43c9f5089e3d5210fa4e2a6ae56e6da7..a66a11839d432c8a612409349e7a563fa9356aa0 100644 (file)
@@ -323,6 +323,8 @@ int xc_shadow_control(int xc_handle,
                       unsigned int sop,
                       unsigned long *dirty_bitmap,
                       unsigned long pages,
+                      unsigned long *mb,
+                      uint32_t mode,
                       xc_shadow_control_stats_t *stats);
 
 int xc_bvtsched_global_set(int xc_handle,
index 83c52ebc19ba6b8a21718b54d7096e52e5d2ca94..f0f60c9c5c0dea1ae29d9d27f78ee6a28b856b2f 100644 (file)
@@ -60,6 +60,8 @@ int main(int argc, char *argv[])
                            mode, 
                            NULL,
                            0,
+                           NULL,
+                           0,
                            NULL) < 0 )
     {    
         fprintf(stderr, "Error reseting performance counters: %d (%s)\n",
index 3e5a9624d8f2644c7fce43298fa14137abae04b8..2c55ca079f9266a08abbadaf6fa9461f282affc9 100644 (file)
@@ -669,6 +669,59 @@ static PyObject *pyxc_sedf_domain_get(XcObject *self, PyObject *args)
                          "weight",    weight);
 }
 
+static PyObject *pyxc_shadow_control(PyObject *self,
+                                     PyObject *args,
+                                     PyObject *kwds)
+{
+    XcObject *xc = (XcObject *)self;
+
+    uint32_t dom;
+    int op=0;
+
+    static char *kwd_list[] = { "dom", "op", NULL };
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list, 
+                                      &dom, &op) )
+        return NULL;
+    
+    if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, NULL, 0, NULL) 
+         < 0 )
+        return PyErr_SetFromErrno(xc_error);
+    
+    Py_INCREF(zero);
+    return zero;
+}
+
+static PyObject *pyxc_shadow_mem_control(PyObject *self,
+                                         PyObject *args,
+                                         PyObject *kwds)
+{
+    XcObject *xc = (XcObject *)self;
+    int op;
+    uint32_t dom;
+    int mbarg = -1;
+    unsigned long mb;
+
+    static char *kwd_list[] = { "dom", "mb", NULL };
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list, 
+                                      &dom, &mbarg) )
+        return NULL;
+    
+    if ( mbarg < 0 ) 
+        op = DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION;
+    else 
+    {
+        mb = mbarg;
+        op = DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION;
+    }
+    if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, &mb, 0, NULL) < 0 )
+        return PyErr_SetFromErrno(xc_error);
+    
+    mbarg = mb;
+    return Py_BuildValue("i", mbarg);
+}
+
 static PyObject *pyxc_sched_credit_domain_set(XcObject *self,
                                               PyObject *args,
                                               PyObject *kwds)
@@ -1119,6 +1172,22 @@ static PyMethodDef pyxc_methods[] = {
       "Returns [dict]: information about Xen"
       "        [None]: on failure.\n" },
 
+    { "shadow_control", 
+      (PyCFunction)pyxc_shadow_control, 
+      METH_VARARGS | METH_KEYWORDS, "\n"
+      "Set parameter for shadow pagetable interface\n"
+      " dom [int]:   Identifier of domain.\n"
+      " op [int, 0]: operation\n\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
+    { "shadow_mem_control", 
+      (PyCFunction)pyxc_shadow_mem_control, 
+      METH_VARARGS | METH_KEYWORDS, "\n"
+      "Set or read shadow pagetable memory use\n"
+      " dom [int]:   Identifier of domain.\n"
+      " mb [int, -1]: MB of shadow memory this domain should have.\n\n"
+      "Returns: [int] MB of shadow memory in use by this domain.\n" },
+
     { "domain_setmaxmem", 
       (PyCFunction)pyxc_domain_setmaxmem, 
       METH_VARARGS, "\n"
index 52cca550d440b542d256b40c5812ba2f4b88a1a0..c253dc2777e5c1ffc521c8df7ee8381baba84a3f 100644 (file)
@@ -532,6 +532,30 @@ class XendDomain:
         except Exception, ex:
             raise XendError(str(ex))
 
+    def domain_shadow_control(self, domid, op):
+        """Shadow page control."""
+        dominfo = self.domain_lookup(domid)
+        try:
+            return xc.shadow_control(dominfo.getDomid(), op)
+        except Exception, ex:
+            raise XendError(str(ex))
+
+    def domain_shadow_mem_get(self, domid):
+        """Get shadow pagetable memory allocation."""
+        dominfo = self.domain_lookup(domid)
+        try:
+            return xc.shadow_mem_control(dominfo.getDomid())
+        except Exception, ex:
+            raise XendError(str(ex))
+
+    def domain_shadow_mem_set(self, domid, mb):
+        """Set shadow pagetable memory allocation."""
+        dominfo = self.domain_lookup(domid)
+        try:
+            return xc.shadow_mem_control(dominfo.getDomid(), mb=mb)
+        except Exception, ex:
+            raise XendError(str(ex))
+
     def domain_sched_credit_get(self, domid):
         """Get credit scheduler parameters for a domain.
         """
index 3bc69981e860729be97084426ff47564db271a35..ab0554fccd99d31bc4718ccf6861c8b7b9a11d08 100644 (file)
@@ -30,6 +30,7 @@ import string
 import time
 import threading
 import os
+import math
 
 import xen.lowlevel.xc
 from xen.util import asserts
@@ -126,16 +127,17 @@ VM_CONFIG_PARAMS = [
 # don't come out of xc in the same form as they are specified in the config
 # file, so those are handled separately.
 ROUNDTRIPPING_CONFIG_ENTRIES = [
-    ('uuid',       str),
-    ('vcpus',      int),
-    ('vcpu_avail', int),
-    ('cpu_weight', float),
-    ('memory',     int),
-    ('maxmem',     int),
-    ('bootloader', str),
+    ('uuid',            str),
+    ('vcpus',           int),
+    ('vcpu_avail',      int),
+    ('cpu_weight',      float),
+    ('memory',          int),
+    ('shadow_memory',   int),
+    ('maxmem',          int),
+    ('bootloader',      str),
     ('bootloader_args', str),
-    ('features', str),
-    ('localtime', int),
+    ('features',        str),
+    ('localtime',       int),
     ]
 
 ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFIG_PARAMS
@@ -146,12 +148,13 @@ ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFIG_PARAMS
 # entries written to the store that cannot be reconfigured on-the-fly.
 #
 VM_STORE_ENTRIES = [
-    ('uuid',       str),
-    ('vcpus',      int),
-    ('vcpu_avail', int),
-    ('memory',     int),
-    ('maxmem',     int),
-    ('start_time', float),
+    ('uuid',          str),
+    ('vcpus',         int),
+    ('vcpu_avail',    int),
+    ('memory',        int),
+    ('shadow_memory', int),
+    ('maxmem',        int),
+    ('start_time',    float),
     ]
 
 VM_STORE_ENTRIES += VM_CONFIG_PARAMS
@@ -572,6 +575,7 @@ class XendDomainInfo:
             defaultInfo('vcpu_avail',   lambda: (1 << self.info['vcpus']) - 1)
 
             defaultInfo('memory',       lambda: 0)
+            defaultInfo('shadow_memory', lambda: 0)
             defaultInfo('maxmem',       lambda: 0)
             defaultInfo('bootloader',   lambda: None)
             defaultInfo('bootloader_args', lambda: None)            
@@ -1280,7 +1284,18 @@ class XendDomainInfo:
             xc.domain_setmaxmem(self.domid, self.info['maxmem'] * 1024)
 
             m = self.image.getDomainMemory(self.info['memory'] * 1024)
-            balloon.free(m)
+
+            # get the domain's shadow memory requirement
+            sm = int(math.ceil(self.image.getDomainShadowMemory(m) / 1024.0))
+            if self.info['shadow_memory'] > sm:
+                sm = self.info['shadow_memory']
+
+            # Make sure there's enough RAM available for the domain
+            balloon.free(m + sm * 1024)
+
+            # Set up the shadow memory
+            sm = xc.shadow_mem_control(self.domid, mb=sm)
+            self.info['shadow_memory'] = sm
 
             init_reservation = self.info['memory'] * 1024
             if os.uname()[4] in ('ia64', 'ppc64'):
index 64fb8109442b0f00b86577859320c9d73c21987d..268462c5812078a46159a2108f4c6869f4c33ba0 100644 (file)
@@ -153,6 +153,12 @@ class ImageHandler:
                 mem_kb += 4*1024;
         return mem_kb
 
+    def getDomainShadowMemory(self, mem_kb):
+        """@return The minimum shadow memory required, in KiB, for a domain 
+        with mem_kb KiB of RAM."""
+        # PV domains don't need any shadow memory
+        return 0
+
     def buildDomain(self):
         """Build the domain. Define in subclass."""
         raise NotImplementedError()
@@ -364,6 +370,17 @@ class HVMImageHandler(ImageHandler):
             extra_pages = int( math.ceil( extra_mb*1024 / page_kb ))
         return mem_kb + extra_pages * page_kb
 
+    def getDomainShadowMemory(self, mem_kb):
+        """@return The minimum shadow memory required, in KiB, for a domain 
+        with mem_kb KiB of RAM."""
+        if os.uname()[4] in ('ia64', 'ppc64'):
+            # Explicit shadow memory is not a concept 
+            return 0
+        else:
+            # 1MB per vcpu plus 4Kib/Mib of RAM.  This is higher than 
+            # the minimum that Xen would allocate if no value were given.
+            return 1024 * self.vm.getVCpuCount() + mem_kb / 256
+
     def register_shutdown_watch(self):
         """ add xen store watch on control/shutdown """
         self.shutdownWatch = xswatch(self.vm.dompath + "/control/shutdown", \
index 549018e209a94449060ad357bc4f0d704fb89468..6416aaab3f75fb008a81d473e703ef74d07a318d 100644 (file)
@@ -158,6 +158,10 @@ gopts.var('maxmem', val='MEMORY',
           fn=set_int, default=None,
           use="Maximum domain memory in MB.")
 
+gopts.var('shadow_memory', val='MEMORY',
+          fn=set_int, default=0,
+          use="Domain shadow memory in MB.")
+
 gopts.var('cpu', val='CPU',
           fn=set_int, default=None,
           use="CPU to run the VCPU0 on.")
@@ -666,8 +670,9 @@ def make_config(vals):
             if v:
                 config.append([n, v])
 
-    map(add_conf, ['name', 'memory', 'maxmem', 'restart', 'on_poweroff',
-                   'on_reboot', 'on_crash', 'vcpus', 'features'])
+    map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
+                   'restart', 'on_poweroff', 'on_reboot', 'on_crash',
+                   'vcpus', 'features'])
 
     if vals.uuid is not None:
         config.append(['uuid', vals.uuid])
index aebee65e9c40c0330a8fcd53526dd688149ba727..e2465942452b8f48bd690dea5f9d0e9795de58ba 100644 (file)
@@ -8,7 +8,6 @@ subdir-$(x86_32) += x86_32
 subdir-$(x86_64) += x86_64
 
 obj-y += apic.o
-obj-y += audit.o
 obj-y += bitops.o
 obj-y += compat.o
 obj-y += delay.o
@@ -41,12 +40,21 @@ obj-y += usercopy.o
 obj-y += x86_emulate.o
 
 ifneq ($(pae),n)
-obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
+obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o
 else
-obj-$(x86_32) += shadow32.o
+obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o
 endif
 
-obj-$(x86_64) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
+obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \
+                 shadow2_g2_on_s3.o
+
+guest_levels  = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
+shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
+shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
+                -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
+
+shadow2_%.o: shadow2.c $(HDRS) Makefile
+       $(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@
 
 obj-$(crash_debug) += gdbstub.o
 
diff --git a/xen/arch/x86/audit.c b/xen/arch/x86/audit.c
deleted file mode 100644 (file)
index bacdb9c..0000000
+++ /dev/null
@@ -1,984 +0,0 @@
-/******************************************************************************
- * arch/x86/audit.c
- * 
- * Copyright (c) 2002-2005 K A Fraser
- * Copyright (c) 2004 Christian Limpach
- * Copyright (c) 2005 Michael A Fetterman
- * 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-#include <xen/config.h>
-#include <xen/init.h>
-#include <xen/kernel.h>
-#include <xen/lib.h>
-#include <xen/mm.h>
-#include <xen/perfc.h>
-#include <asm/shadow.h>
-#include <asm/page.h>
-#include <asm/flushtlb.h>
-
-/* XXX SMP bug -- these should not be statics... */
-static int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
-static int l1, l2, oos_count, page_count;
-
-#define FILE_AND_LINE 0
-
-#if FILE_AND_LINE
-#define adjust(_p, _a) _adjust((_p), (_a), __FILE__, __LINE__)
-#define ADJUST_EXTRA_ARGS ,const char *file, int line
-#define APRINTK(_f, _a...) printk(_f " %s:%d\n", ## _a, file, line)
-#else
-#define adjust _adjust
-#define ADJUST_EXTRA_ARGS
-#define APRINTK(_f, _a...) printk(_f "\n", ##_a)
-#endif
-
-int audit_adjust_pgtables(struct domain *d, int dir, int noisy)
-{
-    int errors = 0;
-    int shadow_refcounts = !!shadow_mode_refcounts(d);
-    int shadow_enabled = !!shadow_mode_enabled(d);
-
-    int l2limit( unsigned long mfn )
-    {
-
-        if ( shadow_mode_external(d) )
-            return L2_PAGETABLE_ENTRIES;
-
-#ifdef __i386__
-#ifdef CONFIG_X86_PAE
-        /* 32b PAE */
-        if ( (( mfn_to_page(mfn)->u.inuse.type_info & PGT_va_mask ) 
-           >> PGT_va_shift) == 3 )
-            return l2_table_offset(HYPERVISOR_VIRT_START);
-        else
-            return L2_PAGETABLE_ENTRIES;
-#else
-        /* 32b non-PAE */
-        return DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-#endif
-#else
-        /* 64b */
-        return 0; /* XXX x86/64 XXX */
-#endif
-    }
-
-    void _adjust(struct page_info *page, int adjtype ADJUST_EXTRA_ARGS)
-    {
-        int count;
-
-        if ( adjtype )
-        {
-            /* adjust the type count */
-            int tcount = page->u.inuse.type_info & PGT_count_mask;
-            tcount += dir;
-            ttot++;
-
-            if ( page_get_owner(page) == NULL )
-            {
-                APRINTK("adjust(mfn=%lx, dir=%d, adjtype=%d) owner=NULL",
-                        page_to_mfn(page), dir, adjtype);
-                errors++;
-            }
-
-            if ( tcount < 0 )
-            {
-                APRINTK("Audit %d: type count went below zero "
-                        "mfn=%lx t=%" PRtype_info " ot=%x",
-                        d->domain_id, page_to_mfn(page),
-                        page->u.inuse.type_info,
-                        page->tlbflush_timestamp);
-                errors++;
-            }
-            else if ( (tcount & ~PGT_count_mask) != 0 )
-            {
-                APRINTK("Audit %d: type count overflowed "
-                        "mfn=%lx t=%" PRtype_info " ot=%x",
-                        d->domain_id, page_to_mfn(page),
-                        page->u.inuse.type_info,
-                        page->tlbflush_timestamp);
-                errors++;
-            }
-            else
-                page->u.inuse.type_info += dir;
-        }
-
-        /* adjust the general count */
-        count = (page->count_info & PGC_count_mask) + dir;
-        ctot++;
-
-        if ( count < 0 )
-        {
-            APRINTK("Audit %d: general count went below zero "
-                    "mfn=%lx t=%" PRtype_info " ot=%x",
-                    d->domain_id, page_to_mfn(page),
-                    page->u.inuse.type_info,
-                    page->tlbflush_timestamp);
-            errors++;
-        }
-        else if ( (count & ~PGT_count_mask) != 0 )
-        {
-            APRINTK("Audit %d: general count overflowed "
-                    "mfn=%lx t=%" PRtype_info " ot=%x",
-                    d->domain_id, page_to_mfn(page),
-                    page->u.inuse.type_info,
-                    page->tlbflush_timestamp);
-            errors++;
-        }
-        else
-            page->count_info += dir;
-    }
-
-    void adjust_l2_page(unsigned long mfn, int shadow)
-    {
-        l2_pgentry_t *pt = map_domain_page(mfn);
-        int i;
-        u32 page_type;
-
-        for ( i = 0; i < l2limit(mfn); i++ )
-        {
-            if ( l2e_get_flags(pt[i]) & _PAGE_PRESENT )
-            {
-               unsigned long l1mfn = l2e_get_pfn(pt[i]);
-                struct page_info *l1page = mfn_to_page(l1mfn);
-
-                if ( noisy )
-                {
-                    if ( shadow )
-                    {
-                        if ( page_get_owner(l1page) != NULL )
-                        {
-                            printk("L2: Bizarre shadow L1 page mfn=%lx "
-                                   "belonging to a domain %p (id=%d)\n",
-                                   l1mfn,
-                                   page_get_owner(l1page),
-                                   page_get_owner(l1page)->domain_id);
-                            errors++;
-                            continue;
-                        }
-
-                        page_type = l1page->u.inuse.type_info & PGT_type_mask;
-                        if ( page_type != PGT_l1_shadow )
-                        {
-                            printk("Audit %d: [Shadow L2 mfn=%lx i=%x] "
-                                   "Expected Shadow L1 t=%" PRtype_info 
-                                  " mfn=%lx\n",
-                                   d->domain_id, mfn, i,
-                                   l1page->u.inuse.type_info, l1mfn);
-                            errors++;
-                        }
-                    }
-                    else
-                    {
-                        if ( page_get_owner(l1page) != d )
-                        {
-                            printk("L2: Skip bizarre L1 page mfn=%lx "
-                                   "belonging to other dom %p (id=%d)\n",
-                                   l1mfn,
-                                   page_get_owner(l1page),
-                                   (page_get_owner(l1page)
-                                    ? page_get_owner(l1page)->domain_id
-                                    : -1));
-                            errors++;
-                            continue;
-                        }
-
-                        page_type = l1page->u.inuse.type_info & PGT_type_mask;
-                        if ( page_type == PGT_l2_page_table )
-                        {
-                            printk("Audit %d: [%x] Found %s Linear PT "
-                                   "t=%" PRtype_info " mfn=%lx\n",
-                                   d->domain_id, i, (l1mfn==mfn) ? "Self" : "Other",
-                                   l1page->u.inuse.type_info, l1mfn);
-                        }
-                        else if ( page_type != PGT_l1_page_table )
-                        {
-                            printk("Audit %d: [L2 mfn=%lx i=%x] "
-                                   "Expected L1 t=%" PRtype_info " mfn=%lx\n",
-                                   d->domain_id, mfn, i,
-                                   l1page->u.inuse.type_info, l1mfn);
-                            errors++;
-                        }
-                    }
-                }
-
-                adjust(l1page, !shadow);
-            }
-        }
-
-        if ( shadow_mode_translate(d) && !shadow_mode_external(d) )
-        {
-            unsigned long hl2mfn =
-                l2e_get_pfn(pt[l2_table_offset(LINEAR_PT_VIRT_START)]);
-            struct page_info *hl2page = mfn_to_page(hl2mfn);
-            adjust(hl2page, 0);
-        }
-
-        unmap_domain_page(pt);
-    }
-
-    void adjust_hl2_page(unsigned long hl2mfn)
-    {
-        l2_pgentry_t *pt = map_domain_page(hl2mfn);
-        int i;
-
-        for ( i = 0; i < l2limit(hl2mfn); i++ )
-        {
-            if ( l2e_get_flags(pt[i]) & _PAGE_PRESENT )
-            {
-                unsigned long mfn = l2e_get_pfn(pt[i]);
-                struct page_info *gpage = mfn_to_page(mfn);
-
-                if ( mfn < 0x100 )
-                {
-                    lowmem_mappings++;
-                    continue;
-                }
-
-                if ( !mfn_valid(mfn) )
-                {
-                    io_mappings++;
-                    continue;
-                }
-
-                if ( noisy )
-                {
-                    if ( page_get_owner(gpage) != d )
-                    {
-                        printk("Audit %d: [hl2mfn=%lx,i=%x] Skip foreign page "
-                               "dom=%p (id=%d) mfn=%lx c=%08x t=%"
-                              PRtype_info "\n",
-                               d->domain_id, hl2mfn, i,
-                               page_get_owner(gpage),
-                               page_get_owner(gpage)->domain_id,
-                               mfn,
-                               gpage->count_info,
-                               gpage->u.inuse.type_info);
-                        continue;
-                    }
-                }
-                adjust(gpage, 0);
-            }
-        }
-
-        unmap_domain_page(pt);
-    }
-
-    void adjust_l1_page(unsigned long l1mfn)
-    {
-        l1_pgentry_t *pt = map_domain_page(l1mfn);
-        int i;
-
-        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-        {
-            if ( l1e_get_flags(pt[i]) & _PAGE_PRESENT )
-            {
-                unsigned long mfn = l1e_get_pfn(pt[i]);
-                struct page_info *gpage = mfn_to_page(mfn);
-
-                if ( mfn < 0x100 )
-                {
-                    lowmem_mappings++;
-                    continue;
-                }
-
-                if ( !mfn_valid(mfn) )
-                {
-                    io_mappings++;
-                    continue;
-                }
-
-                if ( noisy )
-                {
-                    if ( l1e_get_flags(pt[i]) & _PAGE_RW )
-                    {
-                        // If it's not a writable page, complain.
-                        //
-                        if ( !((gpage->u.inuse.type_info & PGT_type_mask) ==
-                               PGT_writable_page) )
-                        {
-                            printk("Audit %d: [l1mfn=%lx, i=%x] Illegal RW "
-                                   "t=%" PRtype_info " mfn=%lx\n",
-                                   d->domain_id, l1mfn, i,
-                                   gpage->u.inuse.type_info, mfn);
-                            errors++;
-                        }
-
-                        if ( shadow_refcounts &&
-                             page_is_page_table(gpage) &&
-                             ! page_out_of_sync(gpage) )
-                        {
-                            printk("Audit %d: [l1mfn=%lx, i=%x] Illegal RW of "
-                                   "page table mfn=%lx\n",
-                                   d->domain_id, l1mfn, i, mfn);
-                            errors++;
-                        }
-                    }             
-
-                    if ( page_get_owner(gpage) != d )
-                    {
-                        printk("Audit %d: [l1mfn=%lx,i=%x] Skip foreign page "
-                               "dom=%p (id=%d) mfn=%lx c=%08x t=%" 
-                              PRtype_info "\n",
-                               d->domain_id, l1mfn, i,
-                               page_get_owner(gpage),
-                               page_get_owner(gpage)->domain_id,
-                               mfn,
-                               gpage->count_info,
-                               gpage->u.inuse.type_info);
-                        continue;
-                    }
-                }
-
-                adjust(gpage, (l1e_get_flags(pt[i]) & _PAGE_RW) ? 1 : 0);
-            }
-        }
-
-        unmap_domain_page(pt);
-    }
-
-    void adjust_shadow_tables(void)
-    {
-        struct shadow_status *a;
-        unsigned long smfn, gmfn;
-        struct page_info *page;
-        int i;
-
-        for ( i = 0; i < shadow_ht_buckets; i++ )
-        {
-            a = &d->arch.shadow_ht[i];
-            while ( a && a->gpfn_and_flags )
-            {
-                gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
-                smfn = a->smfn;
-                page = mfn_to_page(smfn);
-
-                switch ( a->gpfn_and_flags & PGT_type_mask ) {
-                case PGT_writable_pred:
-                    break;
-                case PGT_snapshot:
-                    adjust(mfn_to_page(gmfn), 0);
-                    break;
-                case PGT_l1_shadow:
-                    adjust(mfn_to_page(gmfn), 0);
-                    if ( shadow_refcounts )
-                        adjust_l1_page(smfn);
-                    if ( page->u.inuse.type_info & PGT_pinned )
-                        adjust(page, 0);
-                    break;
-                case PGT_hl2_shadow:
-                    adjust(mfn_to_page(gmfn), 0);
-                    if ( shadow_refcounts )
-                        adjust_hl2_page(smfn);
-                    if ( page->u.inuse.type_info & PGT_pinned )
-                        adjust(page, 0);
-                    break;
-                case PGT_l2_shadow:
-                    adjust(mfn_to_page(gmfn), 0);
-                    adjust_l2_page(smfn, 1);
-                    if ( page->u.inuse.type_info & PGT_pinned )
-                        adjust(page, 0);
-                    break;
-                default:
-                    BUG();
-                    break;
-                }
-
-                a = a->next;
-            }
-        }
-    }
-
-    void adjust_oos_list(void)
-    {
-        struct out_of_sync_entry *oos;
-
-        if ( (oos = d->arch.out_of_sync) )
-            ASSERT(shadow_enabled);
-
-        while ( oos )
-        {
-            adjust(mfn_to_page(oos->gmfn), 0);
-
-            // Only use entries that have low bits clear...
-            //
-            if ( !(oos->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
-                adjust(mfn_to_page(oos->writable_pl1e >> PAGE_SHIFT), 0);
-
-            if ( oos->snapshot_mfn != SHADOW_SNAPSHOT_ELSEWHERE )
-                adjust(mfn_to_page(oos->snapshot_mfn), 0);
-
-            oos = oos->next;
-            oos_count++;
-        }
-    }
-
-    void adjust_for_pgtbase(void)
-    {
-        struct vcpu *v;
-
-        for_each_vcpu(d, v)
-        {
-            if ( !pagetable_is_null(v->arch.guest_table) )
-                adjust(mfn_to_page(pagetable_get_pfn(v->arch.guest_table)),
-                       !shadow_mode_refcounts(d));
-            if ( !pagetable_is_null(v->arch.shadow_table) )
-                adjust(mfn_to_page(pagetable_get_pfn(v->arch.shadow_table)),
-                       0);
-            if ( v->arch.monitor_shadow_ref )
-                adjust(mfn_to_page(v->arch.monitor_shadow_ref), 0);
-        }
-    }
-
-    void adjust_guest_pages(void)
-    {
-        struct list_head *list_ent = d->page_list.next;
-        struct page_info *page;
-        unsigned long mfn, snapshot_mfn;
-
-        while ( list_ent != &d->page_list )
-        {
-            u32 page_type;
-
-            page = list_entry(list_ent, struct page_info, list);
-            snapshot_mfn = mfn = page_to_mfn(page);
-            page_type = page->u.inuse.type_info & PGT_type_mask;
-
-            BUG_ON(page_get_owner(page) != d);
-
-            page_count++;
-
-            if ( shadow_enabled && !shadow_refcounts &&
-                 page_out_of_sync(page) )
-            {
-                unsigned long gpfn = mfn_to_gmfn(d, mfn);
-                ASSERT( VALID_M2P(gpfn) );
-                snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
-                ASSERT( snapshot_mfn );
-            }
-
-            switch ( page_type )
-            {
-            case PGT_l2_page_table:
-                l2++;
-
-                if ( noisy )
-                {
-                    if ( shadow_refcounts )
-                    {
-                        printk("Audit %d: found an L2 guest page "
-                               "mfn=%lx t=%" PRtype_info " c=%08x while in shadow mode\n",
-                               d->domain_id, mfn, page->u.inuse.type_info,
-                               page->count_info);
-                        errors++;
-                    }
-
-                    if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
-                    {
-                        if ( (page->u.inuse.type_info & PGT_validated) !=
-                             PGT_validated )
-                        {
-                            printk("Audit %d: L2 mfn=%lx not validated %"
-                                  PRtype_info "\n",
-                                   d->domain_id, mfn, page->u.inuse.type_info);
-                            errors++;
-                        }
-
-                    }
-                }
-
-                if ( page->u.inuse.type_info & PGT_pinned )
-                    adjust(page, 1);
-
-                if ( page->u.inuse.type_info & PGT_validated )
-                    adjust_l2_page(snapshot_mfn, 0);
-
-                break;
-
-            case PGT_l1_page_table:
-                l1++;
-
-                if ( noisy )
-                {
-                    if ( shadow_refcounts )
-                    {
-                        printk("found an L1 guest page mfn=%lx t=%" 
-                              PRtype_info " c=%08x "
-                               "while in shadow mode\n",
-                               mfn, page->u.inuse.type_info, page->count_info);
-                        errors++;
-                    }
-
-                    if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
-                    {
-                        if ( (page->u.inuse.type_info & PGT_validated) !=
-                             PGT_validated )
-                        {
-                            printk("Audit %d: L1 not validated mfn=%lx t=%"
-                                  PRtype_info "\n",
-                                   d->domain_id, mfn, page->u.inuse.type_info);
-                            errors++;
-                        }
-                    }
-                }
-                
-                if ( page->u.inuse.type_info & PGT_pinned )
-                    adjust(page, 1);
-
-                if ( page->u.inuse.type_info & PGT_validated )
-                    adjust_l1_page(snapshot_mfn);
-
-                break;
-
-            case PGT_gdt_page:
-                ASSERT( !page_out_of_sync(page) );
-                adjust(page, 1);
-                break;
-
-            case PGT_ldt_page:
-                ASSERT( !page_out_of_sync(page) );
-                adjust(page, 1);
-                break;
-
-            case PGT_writable_page:
-                if ( shadow_refcounts )
-                {
-                    // In shadow mode, writable pages can get pinned by
-                    // paravirtualized guests that think they are pinning
-                    // their L1s and/or L2s.
-                    //
-                    if ( page->u.inuse.type_info & PGT_pinned )
-                        adjust(page, 1);
-                }
-            }
-
-            list_ent = page->list.next;
-        }
-    }
-
-    adjust_for_pgtbase();
-
-    adjust_guest_pages();
-
-    if ( shadow_enabled )
-    {
-        adjust_oos_list();
-        adjust_shadow_tables();
-    }
-
-    adjust(virt_to_page(d->shared_info), 1);
-
-    return errors;
-}
-
-
-#ifndef NDEBUG
-
-void audit_pagelist(struct domain *d)
-{
-    struct list_head *list_ent;
-    int xenpages, totpages;
-
-    list_ent = d->xenpage_list.next;
-    for ( xenpages = 0; (list_ent != &d->xenpage_list); xenpages++ )
-    {
-        list_ent = list_ent->next;
-    }
-    list_ent = d->page_list.next;
-    for ( totpages = 0; (list_ent != &d->page_list); totpages++ )
-    {
-        list_ent = list_ent->next;
-    }
-
-    if ( xenpages != d->xenheap_pages ||
-         totpages != d->tot_pages )
-    {
-        printk("ARGH! dom %d: xen=%d %d, pages=%d %d\n", d->domain_id,
-               xenpages, d->xenheap_pages, 
-               totpages, d->tot_pages );
-    }
-}
-
-void _audit_domain(struct domain *d, int flags)
-{
-    int shadow_refcounts = !!shadow_mode_refcounts(d);
-
-    void scan_for_pfn_in_mfn(struct domain *d, unsigned long xmfn,
-                             unsigned long mfn)
-    {
-        struct page_info *page = mfn_to_page(mfn);
-        l1_pgentry_t *pt = map_domain_page(mfn);
-        int i;
-
-        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-        {
-            if ( (l1e_get_flags(pt[i]) & _PAGE_PRESENT) && 
-                 (l1e_get_pfn(pt[i]) == xmfn) )
-                printk("     found dom=%d mfn=%lx t=%" PRtype_info " c=%08x "
-                       "pt[i=%x]=%" PRIpte "\n",
-                       d->domain_id, mfn, page->u.inuse.type_info,
-                       page->count_info, i, l1e_get_intpte(pt[i]));
-        }
-
-        unmap_domain_page(pt);           
-    }
-
-    void scan_for_pfn_in_grant_table(struct domain *d, unsigned xmfn)
-    {
-        int i;
-        struct active_grant_entry *act = d->grant_table->active;
-
-        spin_lock(&d->grant_table->lock);
-
-        for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
-        {
-            if ( act[i].pin && (act[i].frame == xmfn) )
-            {
-                printk("     found active grant table entry i=%d dom=%d pin=%d\n",
-                       i, act[i].domid, act[i].pin);
-            }
-        }
-
-        spin_unlock(&d->grant_table->lock);
-    }
-
-    void scan_for_pfn(struct domain *d, unsigned long xmfn)
-    {
-        scan_for_pfn_in_grant_table(d, xmfn);
-
-        if ( !shadow_mode_enabled(d) )
-        {
-            struct list_head *list_ent = d->page_list.next;
-            struct page_info *page;
-
-            while ( list_ent != &d->page_list )
-            {
-                page = list_entry(list_ent, struct page_info, list);
-
-                switch ( page->u.inuse.type_info & PGT_type_mask )
-                {
-                case PGT_l1_page_table:
-                case PGT_l2_page_table:
-                    scan_for_pfn_in_mfn(d, xmfn, page_to_mfn(page));
-                    break;
-                default:
-                    break;
-                }
-
-                list_ent = page->list.next;
-            }
-        }
-        else
-        {
-            struct shadow_status *a;
-            int i;
-            
-            for ( i = 0; i < shadow_ht_buckets; i++ )
-            {
-                a = &d->arch.shadow_ht[i];
-                while ( a && a->gpfn_and_flags )
-                {
-                    switch ( a->gpfn_and_flags & PGT_type_mask )
-                    {
-                    case PGT_l1_shadow:
-                    case PGT_l2_shadow:
-                    case PGT_hl2_shadow:
-                        scan_for_pfn_in_mfn(d, xmfn, a->smfn);
-                        break;
-                    case PGT_snapshot:
-                    case PGT_writable_pred:
-                        break;
-                    default:
-                        BUG();
-                        break;
-                    }
-                    a = a->next;
-                }
-            }
-        }
-    }
-
-    void scan_for_pfn_remote(unsigned long xmfn)
-    {
-        struct domain *e;
-        for_each_domain ( e )
-            scan_for_pfn( e, xmfn );
-    } 
-
-    unsigned long mfn;
-    struct list_head *list_ent;
-    struct page_info *page;
-    int errors = 0;
-
-    if ( (d != current->domain) && shadow_mode_translate(d) )
-    {
-        printk("skipping audit domain of translated domain %d "
-               "from other context\n",
-               d->domain_id);
-        return;
-    }
-
-    if ( d != current->domain )
-        domain_pause(d);
-
-    // Maybe we should just be using BIGLOCK?
-    //
-    if ( !(flags & AUDIT_SHADOW_ALREADY_LOCKED) )
-        shadow_lock(d);
-
-    spin_lock(&d->page_alloc_lock);
-
-    audit_pagelist(d);
-
-    /* PHASE 0 */
-
-    list_ent = d->page_list.next;
-    while ( list_ent != &d->page_list )
-    {
-        u32 page_type;
-        unsigned long pfn;
-
-        page = list_entry(list_ent, struct page_info, list);
-        mfn = page_to_mfn(page);
-        page_type = page->u.inuse.type_info & PGT_type_mask;
-
-        BUG_ON(page_get_owner(page) != d);
-
-        if ( (page->u.inuse.type_info & PGT_count_mask) >
-             (page->count_info & PGC_count_mask) )
-        {
-            printk("taf(%" PRtype_info ") > caf(%08x) mfn=%lx\n",
-                   page->u.inuse.type_info, page->count_info, mfn);
-            errors++;
-        }
-
-        if ( shadow_mode_refcounts(d) &&
-             (page_type == PGT_writable_page) &&
-             !(page->u.inuse.type_info & PGT_validated) )
-        {
-            printk("shadow mode writable page not validated mfn=%lx " 
-                  "t=%" PRtype_info  " c=%08x\n",
-                   mfn, page->u.inuse.type_info, page->count_info);
-            errors++;
-        }
-#if 0   /* SYSV shared memory pages plus writeable files. */
-        if ( page_type == PGT_writable_page && 
-             (page->u.inuse.type_info & PGT_count_mask) > 1 )
-        {
-            printk("writeable page with type count >1: "
-                   "mfn=%lx t=%" PRtype_info " c=%08x\n",
-                  mfn,
-                  page->u.inuse.type_info,
-                  page->count_info );
-            errors++;
-            scan_for_pfn_remote(mfn);
-        }
-#endif
-
-        if ( page_type == PGT_none && 
-             (page->u.inuse.type_info & PGT_count_mask) > 0 )
-        {
-            printk("normal page with type count >0: mfn=%lx t=%" PRtype_info " c=%08x\n",
-                  mfn,
-                  page->u.inuse.type_info,
-                  page->count_info );
-            errors++;
-        }
-
-        if ( page_out_of_sync(page) )
-        {
-            if ( !page_is_page_table(page) )
-            {
-                printk("out of sync page mfn=%lx is not a page table\n", mfn);
-                errors++;
-            }
-            pfn = mfn_to_gmfn(d, mfn);
-            if ( !__shadow_status(d, pfn, PGT_snapshot) )
-            {
-                printk("out of sync page mfn=%lx doesn't have a snapshot\n",
-                       mfn);
-                errors++;
-            }
-            if ( shadow_refcounts
-                 ? (page_type != PGT_writable_page)
-                 : !(page_type && (page_type <= PGT_l4_page_table)) )
-            {
-                printk("out of sync page mfn=%lx has strange type "
-                       "t=%" PRtype_info  " c=%08x\n",
-                       mfn, page->u.inuse.type_info, page->count_info);
-                errors++;
-            }
-        }
-
-        /* Use tlbflush_timestamp to store original type_info. */
-        page->tlbflush_timestamp = page->u.inuse.type_info;
-
-        list_ent = page->list.next;
-    }
-
-    /* PHASE 1 */
-    io_mappings = lowmem_mappings = 0;
-
-    errors += audit_adjust_pgtables(d, -1, 1);
-
-    if ( !(flags & AUDIT_QUIET) &&
-         ((io_mappings > 0) || (lowmem_mappings > 0)) )
-        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
-               d->domain_id, lowmem_mappings, io_mappings);
-
-    /* PHASE 2 */
-
-    list_ent = d->page_list.next;
-    while ( list_ent != &d->page_list )
-    {
-        page = list_entry(list_ent, struct page_info, list);
-        mfn = page_to_mfn(page);
-
-        switch ( page->u.inuse.type_info & PGT_type_mask)
-        {
-        case PGT_l1_page_table:
-        case PGT_l2_page_table:
-        case PGT_l3_page_table:
-        case PGT_l4_page_table:
-            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
-            {
-                printk("Audit %d: type count!=0 t=%" PRtype_info " ot=%x c=%x mfn=%lx\n",
-                       d->domain_id, page->u.inuse.type_info, 
-                       page->tlbflush_timestamp,
-                       page->count_info, mfn);
-                errors++;
-                scan_for_pfn_remote(mfn);
-            }
-            break;
-        case PGT_none:
-        case PGT_writable_page:
-        case PGT_gdt_page:
-        case PGT_ldt_page:
-            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
-            {
-                printk("Audit %d: type count!=0 t=%" PRtype_info " ot=%x c=%x mfn=%lx\n",
-                       d->domain_id, page->u.inuse.type_info, 
-                       page->tlbflush_timestamp,
-                       page->count_info, mfn);
-                //errors++;
-            }
-            break;
-        default:
-            BUG(); // XXX fix me...
-        }
-        
-        if ( (page->count_info & PGC_count_mask) != 1 )
-        {
-            printk("Audit %d: gen count!=1 (c=%x) t=%" PRtype_info " ot=%x mfn=%lx\n",
-                   d->domain_id,
-                   page->count_info,
-                   page->u.inuse.type_info, 
-                   page->tlbflush_timestamp, mfn );
-            //errors++;
-            scan_for_pfn_remote(mfn);
-        }
-
-        list_ent = page->list.next;
-    }
-
-    if ( shadow_mode_enabled(d) )
-    {
-        struct shadow_status *a;
-        struct page_info *page;
-        u32 page_type;
-        int i;
-
-        for ( i = 0; i < shadow_ht_buckets; i++ )
-        {
-            a = &d->arch.shadow_ht[i];
-            while ( a && a->gpfn_and_flags )
-            {
-                page = mfn_to_page(a->smfn);
-                page_type = a->gpfn_and_flags & PGT_type_mask;
-
-                switch ( page_type ) {
-                case PGT_l1_shadow:
-                case PGT_l2_shadow:
-                case PGT_hl2_shadow:
-                case PGT_snapshot:
-                    if ( ((page->u.inuse.type_info & PGT_type_mask) != page_type ) ||
-                         (page->count_info != 0) )
-                    {
-                        printk("Audit %d: shadow page counts wrong "
-                               "mfn=%lx t=%" PRtype_info " c=%08x\n",
-                               d->domain_id, page_to_mfn(page),
-                               page->u.inuse.type_info,
-                               page->count_info);
-                        printk("a->gpfn_and_flags=%"PRIx64"\n",
-                               (u64)a->gpfn_and_flags);
-                        errors++;
-                    }
-                    break;
-                case PGT_writable_pred:
-                    // XXX - nothing to check?
-                    break;
-
-                default:
-                    BUG();
-                    break;
-                }
-
-                a = a->next;
-            }
-        }
-    }
-
-    /* PHASE 3 */
-    ctot = ttot = page_count = l1 = l2 = oos_count = 0;
-
-    audit_adjust_pgtables(d, 1, 0);
-
-#if 0
-    // This covers our sins of trashing the tlbflush_timestamps...
-    //
-    local_flush_tlb();
-#endif
-
-    spin_unlock(&d->page_alloc_lock);
-
-    if ( !(flags & AUDIT_QUIET) )
-        printk("Audit dom%d Done. "
-               "pages=%d oos=%d l1=%d l2=%d ctot=%d ttot=%d\n",
-               d->domain_id, page_count, oos_count, l1, l2, ctot, ttot);
-
-    if ( !(flags & AUDIT_SHADOW_ALREADY_LOCKED) )
-        shadow_unlock(d);
-
-    if ( d != current->domain )
-        domain_unpause(d);
-
-    if ( errors && !(flags & AUDIT_ERRORS_OK) )
-        BUG();
-}
-
-void audit_domains(void)
-{
-    struct domain *d;
-    for_each_domain ( d )
-        audit_domain(d);
-}
-
-void audit_domains_key(unsigned char key)
-{
-    audit_domains();
-}
-#endif
index 214b0c58f88bcd315257e804af966f45c7cc83b7..0038112d63d41974b3d8965854e3b23b728995c6 100644 (file)
@@ -89,7 +89,7 @@ long arch_do_dom0_op(struct dom0_op *op, XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op)
         d = find_domain_by_id(op->u.shadow_control.domain);
         if ( d != NULL )
         {
-            ret = shadow_mode_control(d, &op->u.shadow_control);
+            ret = shadow2_control_op(d, &op->u.shadow_control, u_dom0_op);
             put_domain(d);
             copy_to_guest(u_dom0_op, op, 1);
         } 
index 4dd71b185952ba9c7718f8b79634f4c0f6a231d7..65e4dc4b9c34f8f04ecb5f8f9a21012c826c9416 100644 (file)
@@ -134,13 +134,6 @@ struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
     v->arch.perdomain_ptes =
         d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);
 
-    v->arch.guest_vtable  = __linear_l2_table;
-    v->arch.shadow_vtable = __shadow_linear_l2_table;
-#if defined(__x86_64__)
-    v->arch.guest_vl3table = __linear_l3_table;
-    v->arch.guest_vl4table = __linear_l4_table;
-#endif
-
     pae_l3_cache_init(&v->arch.pae_l3_cache);
 
     return v;
@@ -155,9 +148,7 @@ int arch_domain_create(struct domain *d)
 {
     l1_pgentry_t gdt_l1e;
     int vcpuid, pdpt_order;
-#ifdef __x86_64__
     int i;
-#endif
 
     pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
     d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
@@ -202,8 +193,12 @@ int arch_domain_create(struct domain *d)
 
 #endif /* __x86_64__ */
 
-    shadow_lock_init(d);
-    INIT_LIST_HEAD(&d->arch.free_shadow_frames);
+    shadow2_lock_init(d);
+    for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ )
+        INIT_LIST_HEAD(&d->arch.shadow2_freelists[i]);
+    INIT_LIST_HEAD(&d->arch.shadow2_p2m_freelist);
+    INIT_LIST_HEAD(&d->arch.shadow2_p2m_inuse);
+    INIT_LIST_HEAD(&d->arch.shadow2_toplevel_shadows);
 
     if ( !is_idle_domain(d) )
     {
@@ -234,6 +229,8 @@ int arch_domain_create(struct domain *d)
 
 void arch_domain_destroy(struct domain *d)
 {
+    shadow2_final_teardown(d);
+
     free_xenheap_pages(
         d->arch.mm_perdomain_pt,
         get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
@@ -328,14 +325,6 @@ int arch_set_info_guest(
         if ( !hvm_initialize_guest_resources(v) )
             return -EINVAL;
     }
-    else if ( shadow_mode_refcounts(d) )
-    {
-        if ( !get_page(mfn_to_page(cr3_pfn), d) )
-        {
-            destroy_gdt(v);
-            return -EINVAL;
-        }
-    }
     else
     {
         if ( !get_page_and_type(mfn_to_page(cr3_pfn), d,
@@ -344,9 +333,16 @@ int arch_set_info_guest(
             destroy_gdt(v);
             return -EINVAL;
         }
-    }
+    }    
 
-    update_pagetables(v);
+    /* Shadow2: make sure the domain has enough shadow memory to
+     * boot another vcpu */
+    if ( shadow2_mode_enabled(d) 
+         && d->arch.shadow2_total_pages < shadow2_min_acceptable_pages(d) )
+    {
+        destroy_gdt(v);
+        return -ENOMEM;
+    }
 
     if ( v->vcpu_id == 0 )
         update_domain_wallclock_time(d);
@@ -354,6 +350,11 @@ int arch_set_info_guest(
     /* Don't redo final setup */
     set_bit(_VCPUF_initialised, &v->vcpu_flags);
 
+    if ( shadow2_mode_enabled(d) )
+        shadow2_update_paging_modes(v);
+
+    update_cr3(v);
+
     return 0;
 }
 
@@ -669,7 +670,6 @@ static void __context_switch(void)
             loaddebug(&n->arch.guest_context, 6);
             loaddebug(&n->arch.guest_context, 7);
         }
-
         n->arch.ctxt_switch_to(n);
     }
 
@@ -927,29 +927,34 @@ void domain_relinquish_resources(struct domain *d)
     /* Drop the in-use references to page-table bases. */
     for_each_vcpu ( d, v )
     {
-        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
+        /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling,
+         * or sh2_update_paging_modes()) */
+        pfn = pagetable_get_pfn(v->arch.guest_table);
+        if ( pfn != 0 )
         {
-            if ( !shadow_mode_refcounts(d) )
-                put_page_type(mfn_to_page(pfn));
-            put_page(mfn_to_page(pfn));
-
+            if ( shadow2_mode_refcounts(d) )
+                put_page(mfn_to_page(pfn));
+            else
+                put_page_and_type(mfn_to_page(pfn));
             v->arch.guest_table = pagetable_null();
         }
 
-        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
+#ifdef __x86_64__
+        /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
+        pfn = pagetable_get_pfn(v->arch.guest_table_user);
+        if ( pfn != 0 )
         {
-            if ( !shadow_mode_refcounts(d) )
-                put_page_type(mfn_to_page(pfn));
-            put_page(mfn_to_page(pfn));
-
+            put_page_and_type(mfn_to_page(pfn));
             v->arch.guest_table_user = pagetable_null();
         }
+#endif
     }
 
     if ( d->vcpu[0] && hvm_guest(d->vcpu[0]) )
         hvm_relinquish_guest_resources(d);
 
-    shadow_mode_disable(d);
+    /* Tear down shadow mode stuff. */
+    shadow2_teardown(d);
 
     /*
      * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
@@ -964,26 +969,23 @@ void domain_relinquish_resources(struct domain *d)
 
     /* Free page used by xen oprofile buffer */
     free_xenoprof_pages(d);
-
 }
 
 void arch_dump_domain_info(struct domain *d)
 {
-    if ( shadow_mode_enabled(d) )
+    if ( shadow2_mode_enabled(d) )
     {
-        printk("    shadow mode: ");
-        if ( shadow_mode_refcounts(d) )
+        printk("    shadow2 mode: ");
+        if ( d->arch.shadow2_mode & SHM2_enable )
+            printk("enabled ");
+        if ( shadow2_mode_refcounts(d) )
             printk("refcounts ");
-        if ( shadow_mode_write_all(d) )
-            printk("write_all ");
-        if ( shadow_mode_log_dirty(d) )
+        if ( shadow2_mode_log_dirty(d) )
             printk("log_dirty ");
-        if ( shadow_mode_translate(d) )
+        if ( shadow2_mode_translate(d) )
             printk("translate ");
-        if ( shadow_mode_external(d) )
+        if ( shadow2_mode_external(d) )
             printk("external ");
-        if ( shadow_mode_wr_pt_pte(d) )
-            printk("wr_pt_pte ");
         printk("\n");
     }
 }
index a1d95f77c68c08177057444941d6d7d5ecf1492d..5d270336fcb18389f23f9c0ddc3f95107f0cbe41 100644 (file)
@@ -683,8 +683,11 @@ int construct_dom0(struct domain *d,
     for ( i = 1; i < opt_dom0_max_vcpus; i++ )
         (void)alloc_vcpu(d, i, i);
 
-    /* Set up monitor table */
-    update_pagetables(v);
+    /* Set up CR3 value for write_ptbase */
+    if ( shadow2_mode_enabled(v->domain) )
+        shadow2_update_paging_modes(v);
+    else
+        update_cr3(v);
 
     /* Install the new page tables. */
     local_irq_disable();
@@ -796,10 +799,8 @@ int construct_dom0(struct domain *d,
     new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);
 
     if ( opt_dom0_shadow )
-    {
-        shadow_mode_enable(d, SHM_enable);
-        update_pagetables(v);
-    }
+        if ( shadow2_test_enable(d) == 0 ) 
+            shadow2_update_paging_modes(v);
 
     if ( supervisor_mode_kernel )
     {
index fd4b69423b5c3cde0eef0005e2d42bc05e580ade..6ffbf751f91808a34f41118e920a169d77ded8bd 100644 (file)
@@ -30,6 +30,7 @@
 #include <xen/hypercall.h>
 #include <xen/guest_access.h>
 #include <xen/event.h>
+#include <xen/shadow.h>
 #include <asm/current.h>
 #include <asm/e820.h>
 #include <asm/io.h>
 #include <asm/spinlock.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
 #include <public/sched.h>
 #include <public/hvm/ioreq.h>
 #include <public/version.h>
@@ -61,7 +58,7 @@ struct hvm_function_table hvm_funcs;
 static void hvm_zap_mmio_range(
     struct domain *d, unsigned long pfn, unsigned long nr_pfn)
 {
-    unsigned long i, val = INVALID_MFN;
+    unsigned long i;
 
     ASSERT(d == current->domain);
 
@@ -70,7 +67,8 @@ static void hvm_zap_mmio_range(
         if ( pfn + i >= 0xfffff )
             break;
 
-        __copy_to_user(&phys_to_machine_mapping[pfn + i], &val, sizeof (val));
+        if ( VALID_MFN(gmfn_to_mfn(d, pfn + i)) )
+            guest_remove_page(d, pfn + i);
     }
 }
 
@@ -262,11 +260,13 @@ void hvm_setup_platform(struct domain* d)
     if ( !hvm_guest(v) || (v->vcpu_id != 0) )
         return;
 
+#if 0 /* SHADOW2 does not have this */
     if ( shadow_direct_map_init(d) == 0 )
     {
         printk("Can not allocate shadow direct map for HVM domain.\n");
         domain_crash_synchronous();
     }
+#endif
 
     hvm_zap_iommu_pages(d);
 
@@ -380,6 +380,8 @@ void hvm_hlt(unsigned long rflags)
  */
 int hvm_copy(void *buf, unsigned long vaddr, int size, int dir)
 {
+    struct vcpu *v = current;
+    unsigned long gfn;
     unsigned long mfn;
     char *addr;
     int count;
@@ -389,10 +391,9 @@ int hvm_copy(void *buf, unsigned long vaddr, int size, int dir)
         if (count > size)
             count = size;
 
-        if (hvm_paging_enabled(current))
-            mfn = gva_to_mfn(vaddr);
-        else
-            mfn = get_mfn_from_gpfn(vaddr >> PAGE_SHIFT);
+        gfn = shadow2_gva_to_gfn(v, vaddr);
+        mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn));
+
         if (mfn == INVALID_MFN)
             return 0;
 
@@ -545,7 +546,7 @@ void hvm_do_hypercall(struct cpu_user_regs *pregs)
         return;
     }
 
-    if ( current->domain->arch.ops->guest_paging_levels == PAGING_L4 )
+    if ( current->arch.shadow2->guest_levels == 4 )
     {
         pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi,
                                                        pregs->rsi,
index f1bfd4c47909d7da5a548bdcd4f62860d28defe0..920e7786a0eaf956ce4dd2ec27ff182bf5028dcf 100644 (file)
@@ -21,7 +21,7 @@
 #include <xen/config.h>
 #include <xen/types.h>
 #include <xen/mm.h>
-#include <asm/shadow.h>
+#include <xen/shadow.h>
 #include <xen/domain_page.h>
 #include <asm/page.h>
 #include <xen/event.h>
@@ -35,9 +35,6 @@
 #include <xen/lib.h>
 #include <xen/sched.h>
 #include <asm/current.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
 
 #define DECODE_success  1
 #define DECODE_failure  0
@@ -724,7 +721,7 @@ void send_pio_req(struct cpu_user_regs *regs, unsigned long port,
 
     if (pvalid) {
         if (hvm_paging_enabled(current))
-            p->u.pdata = (void *) gva_to_gpa(value);
+            p->u.data = shadow2_gva_to_gpa(current, value);
         else
             p->u.pdata = (void *) value; /* guest VA == guest PA */
     } else
@@ -774,7 +771,7 @@ void send_mmio_req(
 
     if (pvalid) {
         if (hvm_paging_enabled(v))
-            p->u.pdata = (void *) gva_to_gpa(value);
+            p->u.data = shadow2_gva_to_gpa(v, value);
         else
             p->u.pdata = (void *) value; /* guest VA == guest PA */
     } else
index f7ae00937eb2c6afdbb9d9c6101033cf59ff857b..c6b3e813d506266addfe570a71357c33856e9335 100644 (file)
 #include <xen/irq.h>
 #include <xen/softirq.h>
 #include <xen/hypercall.h>
+#include <xen/domain_page.h>
 #include <asm/current.h>
 #include <asm/io.h>
-#include <asm/shadow.h>
+#include <asm/shadow2.h>
 #include <asm/regs.h>
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
 #include <asm/hvm/svm/emulate.h>
 #include <asm/hvm/svm/vmmcall.h>
 #include <asm/hvm/svm/intr.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
 #include <public/sched.h>
 
 #define SVM_EXTRA_DEBUG
@@ -414,7 +411,7 @@ static int svm_realmode(struct vcpu *v)
     return (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE);
 }
 
-static int svm_instruction_length(struct vcpu *v)
+int svm_guest_x86_mode(struct vcpu *v)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode;
@@ -423,10 +420,20 @@ static int svm_instruction_length(struct vcpu *v)
         mode = vmcb->cs.attributes.fields.l ? 8 : 4;
     else
         mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
-    return svm_instrlen(guest_cpu_user_regs(), mode);
+    return mode;
 }
 
-static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
+int svm_instruction_length(struct vcpu *v)
+{
+    return svm_instrlen(guest_cpu_user_regs(), svm_guest_x86_mode(v));
+}
+
+void svm_update_host_cr3(struct vcpu *v)
+{
+    /* SVM doesn't have a HOST_CR3 equivalent to update. */
+}
+
+unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
 {
     switch ( num )
     {
@@ -436,6 +443,8 @@ static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
         return v->arch.hvm_svm.cpu_cr2;
     case 3:
         return v->arch.hvm_svm.cpu_cr3;
+    case 4:
+        return v->arch.hvm_svm.cpu_shadow_cr4;
     default:
         BUG();
     }
@@ -526,8 +535,6 @@ static void svm_init_hypercall_page(struct domain *d, void *hypercall_page)
 }
 
 
-
-
 int svm_dbg_on = 0;
 
 static inline int svm_do_debugout(unsigned long exit_code)
@@ -647,6 +654,11 @@ static void svm_load_cpu_guest_regs(
     svm_load_cpu_user_regs(v, regs);
 }
 
+int svm_long_mode_enabled(struct vcpu *v)
+{
+    return SVM_LONG_GUEST(v);
+}
+
 
 
 static void arch_svm_do_launch(struct vcpu *v) 
@@ -726,7 +738,6 @@ static void svm_ctxt_switch_to(struct vcpu *v)
 static void svm_final_setup_guest(struct vcpu *v)
 {
     struct domain *d = v->domain;
-    struct vcpu *vc;
 
     v->arch.schedule_tail    = arch_svm_do_launch;
     v->arch.ctxt_switch_from = svm_ctxt_switch_from;
@@ -735,9 +746,12 @@ static void svm_final_setup_guest(struct vcpu *v)
     if ( v != d->vcpu[0] )
         return;
 
-    /* Initialize monitor page table */
-    for_each_vcpu( d, vc )
-        vc->arch.monitor_table = pagetable_null();
+    if ( !shadow2_mode_external(d) )
+    {
+        DPRINTK("Can't init HVM for dom %u vcpu %u: "
+                "not in shadow2 external mode\n", d->domain_id, v->vcpu_id);
+        domain_crash(d);
+    }
 
     /* 
      * Required to do this once per domain
@@ -745,13 +759,6 @@ static void svm_final_setup_guest(struct vcpu *v)
      */
     memset(&d->shared_info->evtchn_mask[0], 0xff, 
            sizeof(d->shared_info->evtchn_mask));       
-
-    /* 
-     * Put the domain in shadow mode even though we're going to be using
-     * the shared 1:1 page table initially. It shouldn't hurt 
-     */
-    shadow_mode_enable(d, SHM_enable|SHM_refcounts|
-                       SHM_translate|SHM_external|SHM_wr_pt_pte);
 }
 
 
@@ -809,9 +816,13 @@ int start_svm(void)
 
     hvm_funcs.realmode = svm_realmode;
     hvm_funcs.paging_enabled = svm_paging_enabled;
+    hvm_funcs.long_mode_enabled = svm_long_mode_enabled;
+    hvm_funcs.guest_x86_mode = svm_guest_x86_mode;
     hvm_funcs.instruction_length = svm_instruction_length;
     hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg;
 
+    hvm_funcs.update_host_cr3 = svm_update_host_cr3;
+    
     hvm_funcs.stts = svm_stts;
     hvm_funcs.set_tsc_offset = svm_set_tsc_offset;
 
@@ -834,7 +845,6 @@ static void svm_relinquish_guest_resources(struct domain *d)
             continue;
 
         destroy_vmcb(&v->arch.hvm_svm);
-        free_monitor_pagetable(v);
         kill_timer(&v->arch.hvm_vcpu.hlt_timer);
         if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) ) 
         {
@@ -851,8 +861,6 @@ static void svm_relinquish_guest_resources(struct domain *d)
 
     if ( d->arch.hvm_domain.buffered_io_va )
         unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
-
-    shadow_direct_map_clean(d);
 }
 
 
@@ -894,7 +902,6 @@ static int svm_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
     unsigned long eip;
-    unsigned long gpa; /* FIXME: PAE */
     int result;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
@@ -907,43 +914,7 @@ static int svm_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
             va, eip, (unsigned long)regs->error_code);
 //#endif
 
-    if ( !svm_paging_enabled(v) )
-    {
-        if ( shadow_direct_map_fault(va, regs) ) 
-            return 1;
-
-        handle_mmio(va, va);
-        return 1;
-    }
-
-
-    gpa = gva_to_gpa(va);
-
-    /* Use 1:1 page table to identify MMIO address space */
-    if (mmio_space(gpa))
-    {
-        /* No support for APIC */
-        if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000)
-        { 
-            int inst_len;
-            inst_len = svm_instruction_length(v);
-            if (inst_len == -1)
-            {
-                printf("%s: INST_LEN - Unable to decode properly\n", __func__);
-                domain_crash_synchronous();
-            }
-
-            __update_guest_eip(vmcb, inst_len);
-
-            return 1;
-        }
-
-        handle_mmio(va, gpa);
-
-        return 1;
-    }
-    
-    result = shadow_fault(va, regs);
+    result = shadow2_fault(va, regs); 
 
     if( result ) {
         /* Let's make sure that the Guest TLB is flushed */
@@ -1035,19 +1006,12 @@ static void svm_vmexit_do_cpuid(struct vmcb_struct *vmcb, unsigned long input,
             clear_bit(X86_FEATURE_APIC, &edx);
         }
 
-#if CONFIG_PAGING_LEVELS < 3
-        clear_bit(X86_FEATURE_PAE, &edx);
-        clear_bit(X86_FEATURE_PSE, &edx);
-        clear_bit(X86_FEATURE_PSE36, &edx);
-#else
-        if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
-        {
-            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
-                clear_bit(X86_FEATURE_PAE, &edx);
-            clear_bit(X86_FEATURE_PSE, &edx);
-            clear_bit(X86_FEATURE_PSE36, &edx);
-        }
+#if CONFIG_PAGING_LEVELS >= 3
+        if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
 #endif
+            clear_bit(X86_FEATURE_PAE, &edx);
+        clear_bit(X86_FEATURE_PSE36, &edx);
+
         /* Clear out reserved bits. */
         ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED;
         edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED;
@@ -1097,23 +1061,12 @@ static void svm_vmexit_do_cpuid(struct vmcb_struct *vmcb, unsigned long input,
         clear_bit(X86_FEATURE_SYSCALL & 31, &edx);
 #endif
 
-#if CONFIG_PAGING_LEVELS < 3
-        clear_bit(X86_FEATURE_NX & 31, &edx);
-        clear_bit(X86_FEATURE_PAE, &edx);
-        clear_bit(X86_FEATURE_PSE, &edx);
-        clear_bit(X86_FEATURE_PSE36, &edx);
-#else
-        if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
-        {
-            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
-            {
-                clear_bit(X86_FEATURE_NX & 31, &edx);
-                clear_bit(X86_FEATURE_PAE, &edx);
-            }
-            clear_bit(X86_FEATURE_PSE, &edx);
-            clear_bit(X86_FEATURE_PSE36, &edx);
-        }
+
+#if CONFIG_PAGING_LEVELS >= 3
+        if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
 #endif
+            clear_bit(X86_FEATURE_PAE, &edx);
+        clear_bit(X86_FEATURE_PSE36, &edx);
 
         /* Make SVM feature invisible to the guest. */
         clear_bit(X86_FEATURE_SVME & 31, &ecx);
@@ -1555,6 +1508,7 @@ static int svm_set_cr0(unsigned long value)
     unsigned long mfn;
     int paging_enabled;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    unsigned long old_base_mfn;
   
     ASSERT(vmcb);
 
@@ -1600,54 +1554,21 @@ static int svm_set_cr0(unsigned long value)
             set_bit(SVM_CPU_STATE_LMA_ENABLED,
                     &v->arch.hvm_svm.cpu_state);
             vmcb->efer |= (EFER_LMA | EFER_LME);
-            if (!shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
-            {
-                printk("Unsupported guest paging levels\n");
-                domain_crash_synchronous(); /* need to take a clean path */
-            }
         }
-        else
 #endif  /* __x86_64__ */
-        {
-#if CONFIG_PAGING_LEVELS >= 3
-            /* seems it's a 32-bit or 32-bit PAE guest */
-            if ( test_bit(SVM_CPU_STATE_PAE_ENABLED,
-                        &v->arch.hvm_svm.cpu_state) )
-            {
-                /* The guest enables PAE first and then it enables PG, it is
-                 * really a PAE guest */
-                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous();
-                }
-            }
-            else
-            {
-                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous(); /* need to take a clean path */
-                }
-            }
-#endif
-        }
 
         /* Now arch.guest_table points to machine physical. */
+        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
         v->arch.guest_table = pagetable_from_pfn(mfn);
-        update_pagetables(v);
+        if ( old_base_mfn )
+            put_page(mfn_to_page(old_base_mfn));
+        shadow2_update_paging_modes(v);
 
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", 
                 (unsigned long) (mfn << PAGE_SHIFT));
 
+        vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
         set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-        vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
-
-        /* arch->shadow_table should hold the next CR3 for shadow */
-        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx\n", 
-                    v->arch.hvm_svm.cpu_cr3, mfn);
-
-        return 1;
     }
 
     if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
@@ -1667,17 +1588,16 @@ static int svm_set_cr0(unsigned long value)
             svm_inject_exception(v, TRAP_gp_fault, 1, 0);
             return 0;
         }
-
-        clear_all_shadow_status( v->domain );
+        shadow2_update_paging_modes(v);
+        vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
         set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-        vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
     }
     else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
     {
         /* we should take care of this kind of situation */
-        clear_all_shadow_status(v->domain);
+        shadow2_update_paging_modes(v);
+        vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
         set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-        vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
     }
 
     return 1;
@@ -1786,7 +1706,7 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
             mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
             if (mfn != pagetable_get_pfn(v->arch.guest_table))
                 __hvm_bug(regs);
-            shadow_sync_all(v->domain);
+            shadow2_update_cr3(v);
         }
         else 
         {
@@ -1812,14 +1732,10 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
             /*
              * arch.shadow_table should now hold the next CR3 for shadow
              */
-#if CONFIG_PAGING_LEVELS >= 3
-            if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
-                shadow_sync_all(v->domain);
-#endif
             v->arch.hvm_svm.cpu_cr3 = value;
-            update_pagetables(v);
+            update_cr3(v);
+            vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
             HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
-            vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
         }
         break;
     }
@@ -1839,12 +1755,6 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
 #if CONFIG_PAGING_LEVELS >= 3
                 unsigned long mfn, old_base_mfn;
 
-                if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous(); /* need to take a clean path */
-                }
-
                 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
                                     v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT)) ||
                      !get_page(mfn_to_page(mfn), v->domain) )
@@ -1853,21 +1763,20 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
                     domain_crash_synchronous(); /* need to take a clean path */
                 }
 
-                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-                if ( old_base_mfn )
-                    put_page(mfn_to_page(old_base_mfn));
-
                 /*
                  * Now arch.guest_table points to machine physical.
                  */
 
+                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
                 v->arch.guest_table = pagetable_from_pfn(mfn);
-                update_pagetables(v);
+                if ( old_base_mfn )
+                    put_page(mfn_to_page(old_base_mfn));
+                shadow2_update_paging_modes(v);
 
                 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
                             (unsigned long) (mfn << PAGE_SHIFT));
 
-                vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
+                vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
 
                 /*
                  * arch->shadow_table should hold the next CR3 for shadow
@@ -1876,33 +1785,6 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
                 HVM_DBG_LOG(DBG_LEVEL_VMMU, 
                             "Update CR3 value = %lx, mfn = %lx",
                             v->arch.hvm_svm.cpu_cr3, mfn);
-#endif
-            }
-            else
-            {
-                /*  The guest is a 64 bit or 32-bit PAE guest. */
-#if CONFIG_PAGING_LEVELS >= 3
-                if ( (v->domain->arch.ops != NULL) &&
-                        v->domain->arch.ops->guest_paging_levels == PAGING_L2)
-                {
-                    /* Seems the guest first enables PAE without enabling PG,
-                     * it must enable PG after that, and it is a 32-bit PAE
-                     * guest */
-
-                    if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3))
-                    {
-                        printk("Unsupported guest paging levels\n");
-                        domain_crash_synchronous();
-                    }                   
-                }
-                else
-                {
-                    if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4))
-                    {
-                        printk("Unsupported guest paging levels\n");
-                        domain_crash_synchronous();
-                    }
-                }
 #endif
             }
         }
@@ -1926,7 +1808,7 @@ static int mov_to_cr(int gpreg, int cr, struct cpu_user_regs *regs)
         if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
         {
             set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-            shadow_sync_all(v->domain);
+            shadow2_update_paging_modes(v);
         }
         break;
     }
@@ -2267,7 +2149,7 @@ void svm_handle_invlpg(const short invlpga, struct cpu_user_regs *regs)
 
     /* Overkill, we may not this */
     set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-    shadow_invlpg(v, g_vaddr);
+    shadow2_invlpg(v, g_vaddr);
 }
 
 
@@ -2638,7 +2520,7 @@ void walk_shadow_and_guest_pt(unsigned long gva)
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     unsigned long gpa;
 
-    gpa = gva_to_gpa( gva );
+    gpa = shadow2_gva_to_gpa(current, gva);
     printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 );
     if( !svm_paging_enabled(v) || mmio_space(gpa) )
        return;
@@ -2662,8 +2544,12 @@ void walk_shadow_and_guest_pt(unsigned long gva)
     __copy_from_user(&gpte, &linear_pg_table[ l1_linear_offset(gva) ],
                      sizeof(gpte) );
     printk( "G-PTE = %x, flags=%x\n", gpte.l1, l1e_get_flags(gpte) );
-    __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ],
+
+    BUG(); // need to think about this, and convert usage of
+           // phys_to_machine_mapping to use pagetable format...
+    __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ], 
                       sizeof(spte) );
+
     printk( "S-PTE = %x, flags=%x\n", spte.l1, l1e_get_flags(spte));
 }
 #endif /* SVM_WALK_GUEST_PAGES */
@@ -2704,7 +2590,8 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs regs)
 
     if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF) 
     {
-        if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
+        if (svm_paging_enabled(v) && 
+            !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2)))
         {
             printk("I%08ld,ExC=%s(%d),IP=%x:%llx,I1=%llx,I2=%llx,INT=%llx, "
                    "gpa=%llx\n", intercepts_counter,
@@ -2713,7 +2600,7 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs regs)
                    (unsigned long long) vmcb->exitinfo1,
                    (unsigned long long) vmcb->exitinfo2,
                    (unsigned long long) vmcb->exitintinfo.bytes,
-            (unsigned long long) gva_to_gpa( vmcb->exitinfo2 ) );
+            (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2));
         }
         else 
         {
@@ -2757,7 +2644,7 @@ asmlinkage void svm_vmexit_handler(struct cpu_user_regs regs)
         && ( ( vmcb->exitinfo2 == vmcb->rip )
         || vmcb->exitintinfo.bytes) )
     {
-       if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))     
+       if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
            walk_shadow_and_guest_pt( vmcb->exitinfo2 );
     }
 #endif
index 349381e3ecc82abcc28dcee382ddb9966c33c881..82f7195e73712f003fb40ca42ccf760116ada5a0 100644 (file)
@@ -380,8 +380,8 @@ void svm_do_launch(struct vcpu *v)
         printk("%s: phys_table   = %lx\n", __func__, pt);
     }
 
-    /* At launch we always use the phys_table */
-    vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
+    /* Set cr3 from hw_cr3 even when guest-visible paging is not enabled */
+    vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
 
     if (svm_dbg_on) 
     {
index 7ebca8969326a87abc1c3c3a91e79dada1f02813..9cb27656c3d165708637a12e3b072ff83e52ab1f 100644 (file)
@@ -21,7 +21,8 @@
 #include <xen/types.h>
 #include <xen/mm.h>
 #include <xen/xmalloc.h>
-#include <asm/shadow.h>
+#include <xen/shadow.h>
+#include <xen/domain_page.h>
 #include <asm/page.h>
 #include <xen/event.h>
 #include <xen/trace.h>
index ebd8a42f6876e89e4c8b49581140eb4d4d1e93ad..75de5f49ea37477f2901e9d8c991da3642b3af85 100644 (file)
 #include <asm/flushtlb.h>
 #include <xen/event.h>
 #include <xen/kernel.h>
-#include <asm/shadow.h>
 #include <xen/keyhandler.h>
-
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
+#include <asm/shadow2.h>
 
 static int vmcs_size;
 static int vmcs_order;
@@ -238,7 +234,7 @@ static void vmx_set_host_env(struct vcpu *v)
 
 static void vmx_do_launch(struct vcpu *v)
 {
-/* Update CR3, GDT, LDT, TR */
+/* Update CR3, CR0, CR4, GDT, LDT, TR */
     unsigned int  error = 0;
     unsigned long cr0, cr4;
 
@@ -276,8 +272,11 @@ static void vmx_do_launch(struct vcpu *v)
     error |= __vmwrite(GUEST_TR_BASE, 0);
     error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
 
-    __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
-    __vmwrite(HOST_CR3, pagetable_get_paddr(v->arch.monitor_table));
+    shadow2_update_paging_modes(v);
+    printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n",
+           __func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3);
+    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+    __vmwrite(HOST_CR3, v->arch.cr3);
 
     v->arch.schedule_tail = arch_vmx_do_resume;
 
index 658ee8ae73524908561c71d93faef4d183f273b6..0233f26595e86439bdeee02f7f1d58b345a778a2 100644 (file)
@@ -26,9 +26,9 @@
 #include <xen/softirq.h>
 #include <xen/domain_page.h>
 #include <xen/hypercall.h>
+#include <xen/perfc.h>
 #include <asm/current.h>
 #include <asm/io.h>
-#include <asm/shadow.h>
 #include <asm/regs.h>
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
 #include <asm/hvm/vmx/vmx.h>
 #include <asm/hvm/vmx/vmcs.h>
 #include <asm/hvm/vmx/cpu.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
+#include <asm/shadow2.h>
 #include <public/sched.h>
 #include <public/hvm/ioreq.h>
 #include <asm/hvm/vpic.h>
@@ -69,11 +66,16 @@ static int vmx_initialize_guest_resources(struct vcpu *v)
     if ( v->vcpu_id != 0 )
         return 1;
 
-    for_each_vcpu ( d, vc )
+    if ( !shadow2_mode_external(d) )
     {
-        /* Initialize monitor page table */
-        vc->arch.monitor_table = pagetable_null();
+        DPRINTK("Can't init HVM for dom %u vcpu %u: "
+                "not in shadow2 external mode\n", 
+                d->domain_id, v->vcpu_id);
+        domain_crash(d);
+    }
 
+    for_each_vcpu ( d, vc )
+    {
         memset(&vc->arch.hvm_vmx, 0, sizeof(struct arch_vmx_struct));
 
         if ( (rc = vmx_create_vmcs(vc)) != 0 )
@@ -107,6 +109,7 @@ static int vmx_initialize_guest_resources(struct vcpu *v)
 
         vc->arch.hvm_vmx.io_bitmap_a = io_bitmap_a;
         vc->arch.hvm_vmx.io_bitmap_b = io_bitmap_b;
+
     }
 
     /*
@@ -116,11 +119,6 @@ static int vmx_initialize_guest_resources(struct vcpu *v)
     memset(&d->shared_info->evtchn_mask[0], 0xff,
            sizeof(d->shared_info->evtchn_mask));
 
-    /* Put the domain in shadow mode even though we're going to be using
-     * the shared 1:1 page table initially. It shouldn't hurt */
-    shadow_mode_enable(
-        d, SHM_enable|SHM_refcounts|SHM_translate|SHM_external|SHM_wr_pt_pte);
-
     return 1;
 }
 
@@ -133,7 +131,6 @@ static void vmx_relinquish_guest_resources(struct domain *d)
         vmx_destroy_vmcs(v);
         if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
             continue;
-        free_monitor_pagetable(v);
         kill_timer(&v->arch.hvm_vcpu.hlt_timer);
         if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
         {
@@ -153,8 +150,6 @@ static void vmx_relinquish_guest_resources(struct domain *d)
 
     if ( d->arch.hvm_domain.buffered_io_va )
         unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
-
-    shadow_direct_map_clean(d);
 }
 
 #ifdef __x86_64__
@@ -595,14 +590,6 @@ static void vmx_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs)
     vmx_vmcs_exit(v);
 }
 
-static int vmx_realmode(struct vcpu *v)
-{
-    unsigned long rflags;
-
-    __vmread(GUEST_RFLAGS, &rflags);
-    return rflags & X86_EFLAGS_VM;
-}
-
 static int vmx_instruction_length(struct vcpu *v)
 {
     unsigned long inst_len;
@@ -622,6 +609,8 @@ static unsigned long vmx_get_ctrl_reg(struct vcpu *v, unsigned int num)
         return v->arch.hvm_vmx.cpu_cr2;
     case 3:
         return v->arch.hvm_vmx.cpu_cr3;
+    case 4:
+        return v->arch.hvm_vmx.cpu_shadow_cr4;
     default:
         BUG();
     }
@@ -753,9 +742,13 @@ static void vmx_setup_hvm_funcs(void)
 
     hvm_funcs.realmode = vmx_realmode;
     hvm_funcs.paging_enabled = vmx_paging_enabled;
+    hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
+    hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
     hvm_funcs.instruction_length = vmx_instruction_length;
     hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
 
+    hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
+
     hvm_funcs.stts = vmx_stts;
     hvm_funcs.set_tsc_offset = vmx_set_tsc_offset;
 
@@ -855,53 +848,25 @@ static void inline __update_guest_eip(unsigned long inst_len)
     __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
 }
 
-
 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
 {
-    unsigned long gpa; /* FIXME: PAE */
     int result;
 
 #if 0 /* keep for debugging */
     {
-        unsigned long eip;
+        unsigned long eip, cs;
 
+        __vmread(GUEST_CS_BASE, &cs);
         __vmread(GUEST_RIP, &eip);
         HVM_DBG_LOG(DBG_LEVEL_VMMU,
-                    "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
-                    va, eip, (unsigned long)regs->error_code);
+                    "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
+                    "eip = %lx, error_code = %lx\n",
+                    va, cs, eip, (unsigned long)regs->error_code);
     }
 #endif
 
-    if ( !vmx_paging_enabled(current) )
-    {
-        /* construct 1-to-1 direct mapping */
-        if ( shadow_direct_map_fault(va, regs) ) 
-            return 1;
-
-        handle_mmio(va, va);
-        TRACE_VMEXIT (2,2);
-        return 1;
-    }
-    gpa = gva_to_gpa(va);
-
-    /* Use 1:1 page table to identify MMIO address space */
-    if ( mmio_space(gpa) ){
-        struct vcpu *v = current;
-        /* No support for APIC */
-        if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) { 
-            u32 inst_len;
-            __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len));
-            __update_guest_eip(inst_len);
-            return 1;
-        }
-        TRACE_VMEXIT (2,2);
-        /* in the case of MMIO, we are more interested in gpa than in va */
-        TRACE_VMEXIT (4,gpa);
-        handle_mmio(va, gpa);
-        return 1;
-    }
+    result = shadow2_fault(va, regs);
 
-    result = shadow_fault(va, regs);
     TRACE_VMEXIT (2,result);
 #if 0
     if ( !result )
@@ -972,23 +937,11 @@ static void vmx_vmexit_do_cpuid(struct cpu_user_regs *regs)
                 clear_bit(X86_FEATURE_APIC, &edx);
             }
     
-#if CONFIG_PAGING_LEVELS < 3
-            edx &= ~(bitmaskof(X86_FEATURE_PAE)  |
-                     bitmaskof(X86_FEATURE_PSE)  |
-                     bitmaskof(X86_FEATURE_PSE36));
-#else
-            if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
-            {
-                if ( v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
-                    clear_bit(X86_FEATURE_PSE36, &edx);
-                else
-                {
-                    clear_bit(X86_FEATURE_PAE, &edx);
-                    clear_bit(X86_FEATURE_PSE, &edx);
-                    clear_bit(X86_FEATURE_PSE36, &edx);
-                }
-            }
+#if CONFIG_PAGING_LEVELS >= 3
+            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
 #endif
+                clear_bit(X86_FEATURE_PAE, &edx);
+            clear_bit(X86_FEATURE_PSE36, &edx);
 
             ebx &= NUM_THREADS_RESET_MASK;  
 
@@ -1086,7 +1039,7 @@ static void vmx_vmexit_do_invlpg(unsigned long va)
      * We do the safest things first, then try to update the shadow
      * copying from guest
      */
-    shadow_invlpg(v, va);
+    shadow2_invlpg(v, va);
 }
 
 
@@ -1307,11 +1260,8 @@ vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
 
     error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
 
-    if (!vmx_paging_enabled(v)) {
-        HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
-        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+    if (!vmx_paging_enabled(v))
         goto skip_cr3;
-    }
 
     if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
         /*
@@ -1325,7 +1275,6 @@ vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
             domain_crash_synchronous();
             return 0;
         }
-        shadow_sync_all(v->domain);
     } else {
         /*
          * If different, make a shadow. Check if the PDBR is valid
@@ -1348,13 +1297,17 @@ vmx_world_restore(struct vcpu *v, struct vmx_assist_context *c)
          * arch.shadow_table should now hold the next CR3 for shadow
          */
         v->arch.hvm_vmx.cpu_cr3 = c->cr3;
-        update_pagetables(v);
-        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
-        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
     }
 
  skip_cr3:
 
+    shadow2_update_paging_modes(v);
+    if (!vmx_paging_enabled(v))
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
+    else
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
+    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+
     error |= __vmread(CR4_READ_SHADOW, &old_cr4);
     error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
     error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
@@ -1485,6 +1438,7 @@ static int vmx_set_cr0(unsigned long value)
     int paging_enabled;
     unsigned long vm_entry_value;
     unsigned long old_cr0;
+    unsigned long old_base_mfn;
 
     /*
      * CR0: We don't want to lose PE and PG.
@@ -1514,7 +1468,8 @@ static int vmx_set_cr0(unsigned long value)
             v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
              !get_page(mfn_to_page(mfn), v->domain) )
         {
-            printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
+            printk("Invalid CR3 value = %lx (mfn=%lx)\n", 
+                   v->arch.hvm_vmx.cpu_cr3, mfn);
             domain_crash_synchronous(); /* need to take a clean path */
         }
 
@@ -1539,51 +1494,22 @@ static int vmx_set_cr0(unsigned long value)
             __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
             vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
             __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
-
-            if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
-            {
-                printk("Unsupported guest paging levels\n");
-                domain_crash_synchronous(); /* need to take a clean path */
-            }
         }
-        else
-#endif  /* __x86_64__ */
-        {
-#if CONFIG_PAGING_LEVELS >= 3
-            /* seems it's a 32-bit or 32-bit PAE guest */
-
-            if ( test_bit(VMX_CPU_STATE_PAE_ENABLED,
-                        &v->arch.hvm_vmx.cpu_state) )
-            {
-                /* The guest enables PAE first and then it enables PG, it is
-                 * really a PAE guest */
-                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous();
-                }
-            }
-            else
-            {
-                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous(); /* need to take a clean path */
-                }
-            }
 #endif
-        }
 
         /*
          * Now arch.guest_table points to machine physical.
          */
+        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
         v->arch.guest_table = pagetable_from_pfn(mfn);
-        update_pagetables(v);
+        if (old_base_mfn)
+            put_page(mfn_to_page(old_base_mfn));
+        shadow2_update_paging_modes(v);
 
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
                     (unsigned long) (mfn << PAGE_SHIFT));
 
-        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+        __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
         /*
          * arch->shadow_table should hold the next CR3 for shadow
          */
@@ -1625,7 +1551,6 @@ static int vmx_set_cr0(unsigned long value)
             }
         }
 
-        clear_all_shadow_status(v->domain);
         if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) {
             set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state);
             __vmread(GUEST_RIP, &eip);
@@ -1651,9 +1576,8 @@ static int vmx_set_cr0(unsigned long value)
     }
     else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
     {
-        /* we should take care of this kind of situation */
-        clear_all_shadow_status(v->domain);
-        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+        __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+        shadow2_update_paging_modes(v);
     }
 
     return 1;
@@ -1738,7 +1662,7 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
             mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
             if (mfn != pagetable_get_pfn(v->arch.guest_table))
                 __hvm_bug(regs);
-            shadow_sync_all(v->domain);
+            shadow2_update_cr3(v);
         } else {
             /*
              * If different, make a shadow. Check if the PDBR is valid
@@ -1759,16 +1683,11 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
             /*
              * arch.shadow_table should now hold the next CR3 for shadow
              */
-#if CONFIG_PAGING_LEVELS >= 3
-            if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
-                shadow_sync_all(v->domain);
-#endif
-
             v->arch.hvm_vmx.cpu_cr3 = value;
-            update_pagetables(v);
+            update_cr3(v);
             HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
                         value);
-            __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+            __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
         }
         break;
     }
@@ -1786,12 +1705,6 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
 #if CONFIG_PAGING_LEVELS >= 3
                 unsigned long mfn, old_base_mfn;
 
-                if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous(); /* need to take a clean path */
-                }
-
                 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
                                     v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
                      !get_page(mfn_to_page(mfn), v->domain) )
@@ -1800,21 +1713,20 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
                     domain_crash_synchronous(); /* need to take a clean path */
                 }
 
-                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-                if ( old_base_mfn )
-                    put_page(mfn_to_page(old_base_mfn));
 
                 /*
                  * Now arch.guest_table points to machine physical.
                  */
 
+                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
                 v->arch.guest_table = pagetable_from_pfn(mfn);
-                update_pagetables(v);
+                if ( old_base_mfn )
+                    put_page(mfn_to_page(old_base_mfn));
 
                 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
                             (unsigned long) (mfn << PAGE_SHIFT));
 
-                __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+                __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
 
                 /*
                  * arch->shadow_table should hold the next CR3 for shadow
@@ -1822,27 +1734,6 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
 
                 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
                             v->arch.hvm_vmx.cpu_cr3, mfn);
-#endif
-            }
-            else
-            {
-                /*  The guest is a 64 bit or 32-bit PAE guest. */
-#if CONFIG_PAGING_LEVELS >= 3
-                if ( (v->domain->arch.ops != NULL) &&
-                        v->domain->arch.ops->guest_paging_levels == PAGING_L2)
-                {
-                    /* Seems the guest first enables PAE without enabling PG,
-                     * it must enable PG after that, and it is a 32-bit PAE
-                     * guest */
-
-                    if ( !shadow_set_guest_paging_levels(v->domain,
-                                                            PAGING_L3) )
-                    {
-                        printk("Unsupported guest paging levels\n");
-                        /* need to take a clean path */
-                        domain_crash_synchronous();
-                    }
-                }
 #endif
             }
         }
@@ -1864,8 +1755,7 @@ static int mov_to_cr(int gp, int cr, struct cpu_user_regs *regs)
          * all TLB entries except global entries.
          */
         if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
-            shadow_sync_all(v->domain);
-
+            shadow2_update_paging_modes(v);
         break;
     }
     default:
index 0c35c9b52d269d895d8cf9645f864563979ea5d3..6c0abad2e2ca199c0f8bc8f93f426e7088485a44 100644 (file)
@@ -137,7 +137,7 @@ static void free_l1_table(struct page_info *page);
 
 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
                         unsigned long type);
-static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
+static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t, unsigned long gl1mfn);
 
 /* Used to defer flushing of memory structures. */
 struct percpu_mm_info {
@@ -274,9 +274,9 @@ void share_xen_page_with_privileged_guests(
 #else
 /*
  * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
- * We cannot safely shadow the idle page table, nor shadow-mode page tables
+ * We cannot safely shadow the idle page table, nor shadow (v1) page tables
  * (detected by lack of an owning domain). As required for correctness, we
- * always shadow PDPTs aboive 4GB.
+ * always shadow PDPTs above 4GB.
  */
 #define l3tab_needs_shadow(mfn)                         \
     (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
@@ -297,17 +297,21 @@ static int __init cache_pae_fixmap_address(void)
 }
 __initcall(cache_pae_fixmap_address);
 
-static void __write_ptbase(unsigned long mfn)
+static DEFINE_PER_CPU(u32, make_cr3_timestamp);
+
+void make_cr3(struct vcpu *v, unsigned long mfn)
+/* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
+ * necessary, and sets v->arch.cr3 to the value to load in CR3. */
 {
     l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
-    struct pae_l3_cache *cache = &current->arch.pae_l3_cache;
+    struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
     unsigned int cpu = smp_processor_id();
 
-    /* Fast path 1: does this mfn need a shadow at all? */
+    /* Fast path: does this mfn need a shadow at all? */
     if ( !l3tab_needs_shadow(mfn) )
     {
-        write_cr3(mfn << PAGE_SHIFT);
-        /* Cache is no longer in use or valid (/after/ write to %cr3). */
+        v->arch.cr3 = mfn << PAGE_SHIFT;
+        /* Cache is no longer in use or valid */
         cache->high_mfn = 0;
         return;
     }
@@ -315,13 +319,6 @@ static void __write_ptbase(unsigned long mfn)
     /* Caching logic is not interrupt safe. */
     ASSERT(!in_irq());
 
-    /* Fast path 2: is this mfn already cached? */
-    if ( cache->high_mfn == mfn )
-    {
-        write_cr3(__pa(cache->table[cache->inuse_idx]));
-        return;
-    }
-
     /* Protects against pae_flush_pgd(). */
     spin_lock(&cache->lock);
 
@@ -330,29 +327,33 @@ static void __write_ptbase(unsigned long mfn)
 
     /* Map the guest L3 table and copy to the chosen low-memory cache. */
     *(fix_pae_highmem_pl1e - cpu) = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
+    /* First check the previous high mapping can't be in the TLB. 
+     * (i.e. have we loaded CR3 since we last did this?) */
+    if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
+        local_flush_tlb_one(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
     highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
     lowmem_l3tab  = cache->table[cache->inuse_idx];
     memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
     *(fix_pae_highmem_pl1e - cpu) = l1e_empty();
+    this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
 
-    /* Install the low-memory L3 table in CR3. */
-    write_cr3(__pa(lowmem_l3tab));
+    v->arch.cr3 = __pa(lowmem_l3tab);
 
     spin_unlock(&cache->lock);
 }
 
 #else /* !CONFIG_X86_PAE */
 
-static void __write_ptbase(unsigned long mfn)
+void make_cr3(struct vcpu *v, unsigned long mfn)
 {
-    write_cr3(mfn << PAGE_SHIFT);
+    v->arch.cr3 = mfn << PAGE_SHIFT;
 }
 
 #endif /* !CONFIG_X86_PAE */
 
 void write_ptbase(struct vcpu *v)
 {
-    __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
+    write_cr3(v->arch.cr3);
 }
 
 void invalidate_shadow_ldt(struct vcpu *v)
@@ -423,8 +424,6 @@ int map_ldt_shadow_page(unsigned int off)
 
     BUG_ON(unlikely(in_irq()));
 
-    shadow_sync_va(v, gva);
-
     TOGGLE_MODE();
     __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
                      sizeof(l1e));
@@ -440,12 +439,12 @@ int map_ldt_shadow_page(unsigned int off)
 
     res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
 
-    if ( !res && unlikely(shadow_mode_refcounts(d)) )
+    if ( !res && unlikely(shadow2_mode_refcounts(d)) )
     {
-        shadow_lock(d);
-        shadow_remove_all_write_access(d, gmfn, mfn);
+        shadow2_lock(d);
+        shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
         res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
-        shadow_unlock(d);
+        shadow2_unlock(d);
     }
 
     if ( unlikely(!res) )
@@ -513,7 +512,7 @@ get_linear_pagetable(
     struct page_info *page;
     unsigned long pfn;
 
-    ASSERT( !shadow_mode_refcounts(d) );
+    ASSERT( !shadow2_mode_refcounts(d) );
 
     if ( (root_get_flags(re) & _PAGE_RW) )
     {
@@ -576,7 +575,8 @@ get_page_from_l1e(
 
         if ( !iomem_access_permitted(d, mfn, mfn) )
         {
-            MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
+            MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx", 
+                    d->domain_id, mfn);
             return 0;
         }
 
@@ -587,9 +587,14 @@ get_page_from_l1e(
         d = dom_io;
     }
 
-    okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
-            get_page_and_type(page, d, PGT_writable_page) :
-            get_page(page, d));
+    /* Foreign mappings into guests in shadow2 external mode don't
+     * contribute to writeable mapping refcounts.  (This allows the
+     * qemu-dm helper process in dom0 to map the domain's memory without
+     * messing up the count of "real" writable mappings.) */
+    okay = (((l1e_get_flags(l1e) & _PAGE_RW) && 
+             !(unlikely(shadow2_mode_external(d) && (d != current->domain))))
+            ? get_page_and_type(page, d, PGT_writable_page)
+            : get_page(page, d));
     if ( !okay )
     {
         MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
@@ -610,8 +615,6 @@ get_page_from_l2e(
 {
     int rc;
 
-    ASSERT(!shadow_mode_refcounts(d));
-
     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
         return 1;
 
@@ -641,8 +644,6 @@ get_page_from_l3e(
 {
     int rc;
 
-    ASSERT(!shadow_mode_refcounts(d));
-
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
         return 1;
 
@@ -669,8 +670,6 @@ get_page_from_l4e(
 {
     int rc;
 
-    ASSERT( !shadow_mode_refcounts(d) );
-
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return 1;
 
@@ -727,7 +726,10 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
         domain_crash(d);
     }
 
-    if ( l1e_get_flags(l1e) & _PAGE_RW )
+    /* Remember we didn't take a type-count of foreign writable mappings
+     * to shadow2 external domains */
+    if ( (l1e_get_flags(l1e) & _PAGE_RW) && 
+         !(unlikely((e != d) && shadow2_mode_external(e))) )
     {
         put_page_and_type(page);
     }
@@ -784,7 +786,7 @@ static int alloc_l1_table(struct page_info *page)
     l1_pgentry_t  *pl1e;
     int            i;
 
-    ASSERT(!shadow_mode_refcounts(d));
+    ASSERT(!shadow2_mode_refcounts(d));
 
     pl1e = map_domain_page(pfn);
 
@@ -832,6 +834,8 @@ static int create_pae_xen_mappings(l3_pgentry_t *pl3e)
      *  2. Cannot appear in another page table's L3:
      *     a. alloc_l3_table() calls this function and this check will fail
      *     b. mod_l3_entry() disallows updates to slot 3 in an existing table
+     *
+     * XXX -- this needs revisiting for shadow2_mode_refcount()==true...
      */
     page = l3e_get_page(l3e3);
     BUG_ON(page->u.inuse.type_info & PGT_pinned);
@@ -955,11 +959,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
     l2_pgentry_t  *pl2e;
     int            i;
 
-    /* See the code in shadow_promote() to understand why this is here. */
-    if ( (PGT_base_page_table == PGT_l2_page_table) &&
-         unlikely(shadow_mode_refcounts(d)) )
-        return 1;
-    ASSERT(!shadow_mode_refcounts(d));
+    ASSERT(!shadow2_mode_refcounts(d));
     
     pl2e = map_domain_page(pfn);
 
@@ -1009,11 +1009,7 @@ static int alloc_l3_table(struct page_info *page, unsigned long type)
     l3_pgentry_t  *pl3e;
     int            i;
 
-    /* See the code in shadow_promote() to understand why this is here. */
-    if ( (PGT_base_page_table == PGT_l3_page_table) &&
-         shadow_mode_refcounts(d) )
-        return 1;
-    ASSERT(!shadow_mode_refcounts(d));
+    ASSERT(!shadow2_mode_refcounts(d));
 
 #ifdef CONFIG_X86_PAE
     /*
@@ -1072,11 +1068,7 @@ static int alloc_l4_table(struct page_info *page, unsigned long type)
     unsigned long vaddr;
     int            i;
 
-    /* See the code in shadow_promote() to understand why this is here. */
-    if ( (PGT_base_page_table == PGT_l4_page_table) &&
-         shadow_mode_refcounts(d) )
-        return 1;
-    ASSERT(!shadow_mode_refcounts(d));
+    ASSERT(!shadow2_mode_refcounts(d));
 
     for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
     {
@@ -1183,42 +1175,55 @@ static void free_l4_table(struct page_info *page)
 
 static inline int update_l1e(l1_pgentry_t *pl1e, 
                              l1_pgentry_t  ol1e, 
-                             l1_pgentry_t  nl1e)
+                             l1_pgentry_t  nl1e,
+                             unsigned long gl1mfn,
+                             struct vcpu *v)
 {
+    int rv = 1;
+    if ( unlikely(shadow2_mode_enabled(v->domain)) )
+        shadow2_lock(v->domain);
 #ifndef PTE_UPDATE_WITH_CMPXCHG
-    return !__copy_to_user(pl1e, &nl1e, sizeof(nl1e));
+    rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e)));
 #else
-    intpte_t o = l1e_get_intpte(ol1e);
-    intpte_t n = l1e_get_intpte(nl1e);
-
-    for ( ; ; )
     {
-        if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
+        intpte_t o = l1e_get_intpte(ol1e);
+        intpte_t n = l1e_get_intpte(nl1e);
+        
+        for ( ; ; )
         {
-            MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
-                    ": saw %" PRIpte,
-                    l1e_get_intpte(ol1e),
-                    l1e_get_intpte(nl1e),
-                    o);
-            return 0;
-        }
+            if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
+            {
+                MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
+                        ": saw %" PRIpte,
+                        l1e_get_intpte(ol1e),
+                        l1e_get_intpte(nl1e),
+                        o);
+                rv = 0;
+                break;
+            }
 
-        if ( o == l1e_get_intpte(ol1e) )
-            break;
+            if ( o == l1e_get_intpte(ol1e) )
+                break;
 
-        /* Allowed to change in Accessed/Dirty flags only. */
-        BUG_ON((o ^ l1e_get_intpte(ol1e)) &
-               ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
-        ol1e = l1e_from_intpte(o);
+            /* Allowed to change in Accessed/Dirty flags only. */
+            BUG_ON((o ^ l1e_get_intpte(ol1e)) &
+                   ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
+            ol1e = l1e_from_intpte(o);
+        }
     }
-
-    return 1;
 #endif
+    if ( unlikely(shadow2_mode_enabled(v->domain)) )
+    {
+        shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
+        shadow2_unlock(v->domain);    
+    }
+    return rv;
 }
 
 
 /* Update the L1 entry at pl1e to new value nl1e. */
-static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
+static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, 
+                        unsigned long gl1mfn)
 {
     l1_pgentry_t ol1e;
     struct domain *d = current->domain;
@@ -1226,9 +1231,6 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
     if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
         return 0;
 
-    if ( unlikely(shadow_mode_refcounts(d)) )
-        return update_l1e(pl1e, ol1e, nl1e);
-
     if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
     {
         if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
@@ -1239,13 +1241,13 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
         }
 
         /* Fast path for identical mapping, r/w and presence. */
-        if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
-            return update_l1e(pl1e, ol1e, nl1e);
+        if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
+            return update_l1e(pl1e, ol1e, nl1e, gl1mfn, current);
 
         if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
             return 0;
         
-        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
         {
             put_page_from_l1e(nl1e, d);
             return 0;
@@ -1253,7 +1255,7 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
     }
     else
     {
-        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
             return 0;
     }
 
@@ -1262,9 +1264,9 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
 }
 
 #ifndef PTE_UPDATE_WITH_CMPXCHG
-#define UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
+#define _UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
 #else
-#define UPDATE_ENTRY(_t,_p,_o,_n) ({                            \
+#define _UPDATE_ENTRY(_t,_p,_o,_n) ({                            \
     for ( ; ; )                                                 \
     {                                                           \
         intpte_t __o = cmpxchg((intpte_t *)(_p),                \
@@ -1279,6 +1281,18 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
     }                                                           \
     1; })
 #endif
+#define UPDATE_ENTRY(_t,_p,_o,_n,_m)  ({                            \
+    int rv;                                                         \
+    if ( unlikely(shadow2_mode_enabled(current->domain)) )          \
+        shadow2_lock(current->domain);                              \
+    rv = _UPDATE_ENTRY(_t, _p, _o, _n);                             \
+    if ( unlikely(shadow2_mode_enabled(current->domain)) )          \
+    {                                                               \
+        shadow2_validate_guest_entry(current, _mfn(_m), (_p));      \
+        shadow2_unlock(current->domain);                            \
+    }                                                               \
+    rv;                                                             \
+})
 
 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
 static int mod_l2_entry(l2_pgentry_t *pl2e, 
@@ -1309,19 +1323,19 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
 
         /* Fast path for identical mapping and presence. */
         if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
-            return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
+            return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn);
 
         if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
              unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
             return 0;
 
-        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
+        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
         {
             put_page_from_l2e(nl2e, pfn);
             return 0;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
+    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
     {
         return 0;
     }
@@ -1330,7 +1344,6 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
     return 1;
 }
 
-
 #if CONFIG_PAGING_LEVELS >= 3
 
 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
@@ -1356,7 +1369,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
      */
     if ( pgentry_ptr_to_slot(pl3e) >= 3 )
         return 0;
-#endif
+#endif 
 
     if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
         return 0;
@@ -1372,7 +1385,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
 
         /* Fast path for identical mapping and presence. */
         if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
-            return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
+            return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn);
 
 #if CONFIG_PAGING_LEVELS >= 4
         if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
@@ -1383,15 +1396,15 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
             << L3_PAGETABLE_SHIFT;
         if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
             return 0;
-#endif
+#endif 
 
-        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
+        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
         {
             put_page_from_l3e(nl3e, pfn);
             return 0;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
+    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
     {
         return 0;
     }
@@ -1438,19 +1451,19 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
 
         /* Fast path for identical mapping and presence. */
         if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
-            return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
+            return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn);
 
         if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
              unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
             return 0;
 
-        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
+        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
         {
             put_page_from_l4e(nl4e, pfn);
             return 0;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
+    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
     {
         return 0;
     }
@@ -1506,18 +1519,21 @@ void free_page_type(struct page_info *page, unsigned long type)
          */
         this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
 
-        if ( unlikely(shadow_mode_enabled(owner)) )
+        if ( unlikely(shadow2_mode_enabled(owner)
+                 && !shadow2_lock_is_acquired(owner)) )
         {
             /* Raw page tables are rewritten during save/restore. */
-            if ( !shadow_mode_translate(owner) )
+            if ( !shadow2_mode_translate(owner) )
                 mark_dirty(owner, page_to_mfn(page));
 
-            if ( shadow_mode_refcounts(owner) )
+            if ( shadow2_mode_refcounts(owner) )
                 return;
 
             gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
             ASSERT(VALID_M2P(gmfn));
-            remove_shadow(owner, gmfn, type & PGT_type_mask);
+            shadow2_lock(owner);
+            shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
+            shadow2_unlock(owner);
         }
     }
 
@@ -1573,9 +1589,6 @@ void put_page_type(struct page_info *page)
 
         if ( unlikely((nx & PGT_count_mask) == 0) )
         {
-            /* Record TLB information for flush later. Races are harmless. */
-            page->tlbflush_timestamp = tlbflush_current_time();
-            
             if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
                  likely(nx & PGT_validated) )
             {
@@ -1593,6 +1606,9 @@ void put_page_type(struct page_info *page)
                 x  &= ~PGT_validated;
                 nx &= ~PGT_validated;
             }
+
+            /* Record TLB information for flush later. */
+            page->tlbflush_timestamp = tlbflush_current_time();
         }
         else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) == 
                            (PGT_pinned|PGT_l1_page_table|1)) )
@@ -1682,7 +1698,7 @@ int get_page_type(struct page_info *page, unsigned long type)
 #endif
                     /* Fixme: add code to propagate va_unknown to subtables. */
                     if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
-                         !shadow_mode_refcounts(page_get_owner(page)) )
+                         !shadow2_mode_refcounts(page_get_owner(page)) )
                         return 0;
                     /* This table is possibly mapped at multiple locations. */
                     nx &= ~PGT_va_mask;
@@ -1729,7 +1745,10 @@ int new_guest_cr3(unsigned long mfn)
     int okay;
     unsigned long old_base_mfn;
 
-    if ( shadow_mode_refcounts(d) )
+    if ( hvm_guest(v) && !hvm_paging_enabled(v) )
+        domain_crash_synchronous();
+
+    if ( shadow2_mode_refcounts(d) )
     {
         okay = get_page_from_pagenr(mfn, d);
         if ( unlikely(!okay) )
@@ -1747,7 +1766,7 @@ int new_guest_cr3(unsigned long mfn)
             MEM_LOG("New baseptr %lx: slow path via idle pagetables", mfn);
             old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
             v->arch.guest_table = pagetable_null();
-            update_pagetables(v);
+            update_cr3(v);
             write_cr3(__pa(idle_pg_table));
             if ( old_base_mfn != 0 )
                 put_page_and_type(mfn_to_page(old_base_mfn));
@@ -1769,30 +1788,20 @@ int new_guest_cr3(unsigned long mfn)
     invalidate_shadow_ldt(v);
 
     old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+
     v->arch.guest_table = pagetable_from_pfn(mfn);
-    update_pagetables(v); /* update shadow_table and monitor_table */
+    update_cr3(v); /* update shadow_table and cr3 fields of vcpu struct */
 
     write_ptbase(v);
 
     if ( likely(old_base_mfn != 0) )
     {
-        if ( shadow_mode_refcounts(d) )
+        if ( shadow2_mode_refcounts(d) )
             put_page(mfn_to_page(old_base_mfn));
         else
             put_page_and_type(mfn_to_page(old_base_mfn));
     }
 
-    /* CR3 also holds a ref to its shadow... */
-    if ( shadow_mode_enabled(d) )
-    {
-        if ( v->arch.monitor_shadow_ref )
-            put_shadow_ref(v->arch.monitor_shadow_ref);
-        v->arch.monitor_shadow_ref =
-            pagetable_get_pfn(v->arch.monitor_table);
-        ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
-        get_shadow_ref(v->arch.monitor_shadow_ref);
-    }
-
     return 1;
 }
 
@@ -1807,8 +1816,6 @@ static void process_deferred_ops(void)
 
     if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
     {
-        if ( shadow_mode_enabled(d) )
-            shadow_sync_all(d);
         if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
             flush_tlb_mask(d->domain_dirty_cpumask);
         else
@@ -1974,7 +1981,7 @@ int do_mmuext_op(
             type = PGT_root_page_table;
 
         pin_page:
-            if ( shadow_mode_refcounts(FOREIGNDOM) )
+            if ( shadow2_mode_refcounts(FOREIGNDOM) )
                 break;
 
             okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
@@ -1996,7 +2003,7 @@ int do_mmuext_op(
             break;
 
         case MMUEXT_UNPIN_TABLE:
-            if ( shadow_mode_refcounts(d) )
+            if ( shadow2_mode_refcounts(d) )
                 break;
 
             if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
@@ -2009,6 +2016,12 @@ int do_mmuext_op(
             {
                 put_page_and_type(page);
                 put_page(page);
+                if ( shadow2_mode_enabled(d) )
+                {
+                    shadow2_lock(d);
+                    shadow2_remove_all_shadows(v, _mfn(mfn));
+                    shadow2_unlock(d);
+                }
             }
             else
             {
@@ -2050,9 +2063,9 @@ int do_mmuext_op(
             break;
     
         case MMUEXT_INVLPG_LOCAL:
-            if ( shadow_mode_enabled(d) )
-                shadow_invlpg(v, op.arg1.linear_addr);
-            local_flush_tlb_one(op.arg1.linear_addr);
+            if ( !shadow2_mode_enabled(d) 
+                 || shadow2_invlpg(v, op.arg1.linear_addr) != 0 )
+                local_flush_tlb_one(op.arg1.linear_addr);
             break;
 
         case MMUEXT_TLB_FLUSH_MULTI:
@@ -2098,7 +2111,7 @@ int do_mmuext_op(
             unsigned long ptr  = op.arg1.linear_addr;
             unsigned long ents = op.arg2.nr_ents;
 
-            if ( shadow_mode_external(d) )
+            if ( shadow2_mode_external(d) )
             {
                 MEM_LOG("ignoring SET_LDT hypercall from external "
                         "domain %u", d->domain_id);
@@ -2171,9 +2184,6 @@ int do_mmu_update(
 
     LOCK_BIGLOCK(d);
 
-    if ( unlikely(shadow_mode_enabled(d)) )
-        check_pagetable(v, "pre-mmu"); /* debug */
-
     if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
     {
         count &= ~MMU_UPDATE_PREEMPTED;
@@ -2248,7 +2258,12 @@ int do_mmu_update(
             case PGT_l3_page_table:
             case PGT_l4_page_table:
             {
-                ASSERT(!shadow_mode_refcounts(d));
+                if ( shadow2_mode_refcounts(d) )
+                {
+                    DPRINTK("mmu update on shadow-refcounted domain!");
+                    break;
+                }
+
                 if ( unlikely(!get_page_type(
                     page, type_info & (PGT_type_mask|PGT_va_mask))) )
                     goto not_a_pt;
@@ -2258,10 +2273,7 @@ int do_mmu_update(
                 case PGT_l1_page_table:
                 {
                     l1_pgentry_t l1e = l1e_from_intpte(req.val);
-                    okay = mod_l1_entry(va, l1e);
-                    if ( okay && unlikely(shadow_mode_enabled(d)) )
-                        shadow_l1_normal_pt_update(
-                            d, req.ptr, l1e, &sh_mapcache);
+                    okay = mod_l1_entry(va, l1e, mfn);
                 }
                 break;
                 case PGT_l2_page_table:
@@ -2269,9 +2281,6 @@ int do_mmu_update(
                     l2_pgentry_t l2e = l2e_from_intpte(req.val);
                     okay = mod_l2_entry(
                         (l2_pgentry_t *)va, l2e, mfn, type_info);
-                    if ( okay && unlikely(shadow_mode_enabled(d)) )
-                        shadow_l2_normal_pt_update(
-                            d, req.ptr, l2e, &sh_mapcache);
                 }
                 break;
 #if CONFIG_PAGING_LEVELS >= 3
@@ -2279,9 +2288,6 @@ int do_mmu_update(
                 {
                     l3_pgentry_t l3e = l3e_from_intpte(req.val);
                     okay = mod_l3_entry(va, l3e, mfn, type_info);
-                    if ( okay && unlikely(shadow_mode_enabled(d)) )
-                        shadow_l3_normal_pt_update(
-                            d, req.ptr, l3e, &sh_mapcache);
                 }
                 break;
 #endif
@@ -2290,9 +2296,6 @@ int do_mmu_update(
                 {
                     l4_pgentry_t l4e = l4e_from_intpte(req.val);
                     okay = mod_l4_entry(va, l4e, mfn, type_info);
-                    if ( okay && unlikely(shadow_mode_enabled(d)) )
-                        shadow_l4_normal_pt_update(
-                            d, req.ptr, l4e, &sh_mapcache);
                 }
                 break;
 #endif
@@ -2308,19 +2311,17 @@ int do_mmu_update(
                 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
                     break;
 
-                if ( shadow_mode_enabled(d) )
-                {
-                    shadow_lock(d);
-                    __mark_dirty(d, mfn);
-                    if ( page_is_page_table(page) && !page_out_of_sync(page) )
-                        shadow_mark_mfn_out_of_sync(v, gmfn, mfn);
-                }
+                if ( unlikely(shadow2_mode_enabled(d)) )
+                    shadow2_lock(d);
 
                 *(intpte_t *)va = req.val;
                 okay = 1;
 
-                if ( shadow_mode_enabled(d) )
-                    shadow_unlock(d);
+                if ( unlikely(shadow2_mode_enabled(d)) )
+                {
+                    shadow2_validate_guest_entry(v, _mfn(mfn), va);
+                    shadow2_unlock(d);
+                }
 
                 put_page_type(page);
             }
@@ -2334,12 +2335,6 @@ int do_mmu_update(
 
         case MMU_MACHPHYS_UPDATE:
 
-            if ( shadow_mode_translate(FOREIGNDOM) )
-            {
-                MEM_LOG("can't mutate m2p table of translate mode guest");
-                break;
-            }
-
             mfn = req.ptr >> PAGE_SHIFT;
             gpfn = req.val;
 
@@ -2349,9 +2344,13 @@ int do_mmu_update(
                 break;
             }
 
-            set_gpfn_from_mfn(mfn, gpfn);
+            if ( shadow2_mode_translate(FOREIGNDOM) )
+                shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
+            else 
+                set_gpfn_from_mfn(mfn, gpfn);
             okay = 1;
 
+            // Mark the new gfn dirty...
             mark_dirty(FOREIGNDOM, mfn);
 
             put_page(mfn_to_page(mfn));
@@ -2382,9 +2381,6 @@ int do_mmu_update(
     if ( unlikely(!guest_handle_is_null(pdone)) )
         copy_to_guest(pdone, &done, 1);
 
-    if ( unlikely(shadow_mode_enabled(d)) )
-        check_pagetable(v, "post-mmu"); /* debug */
-
     UNLOCK_BIGLOCK(d);
     return rc;
 }
@@ -2402,7 +2398,6 @@ static int create_grant_pte_mapping(
     struct domain *d = v->domain;
 
     ASSERT(spin_is_locked(&d->big_lock));
-    ASSERT(!shadow_mode_refcounts(d));
 
     gmfn = pte_addr >> PAGE_SHIFT;
     mfn = gmfn_to_mfn(d, gmfn);
@@ -2418,7 +2413,7 @@ static int create_grant_pte_mapping(
     page = mfn_to_page(mfn);
 
     type_info = page->u.inuse.type_info;
-    if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
+    if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||         
          !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
     {
         MEM_LOG("Grant map attempted to update a non-L1 page");
@@ -2427,28 +2422,22 @@ static int create_grant_pte_mapping(
     }
 
     ol1e = *(l1_pgentry_t *)va;
-    if ( !update_l1e(va, ol1e, _nl1e) )
+    if ( !update_l1e(va, ol1e, _nl1e, mfn, v) )
     {
         put_page_type(page);
         rc = GNTST_general_error;
         goto failed;
     } 
 
-    put_page_from_l1e(ol1e, d);
-
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        struct domain_mmap_cache sh_mapcache;
-        domain_mmap_cache_init(&sh_mapcache);
-        shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
-        domain_mmap_cache_destroy(&sh_mapcache);
-    }
+    if ( !shadow2_mode_refcounts(d) )
+        put_page_from_l1e(ol1e, d);
 
     put_page_type(page);
  
  failed:
     unmap_domain_page(va);
     put_page(page);
+
     return rc;
 }
 
@@ -2462,8 +2451,6 @@ static int destroy_grant_pte_mapping(
     u32 type_info;
     l1_pgentry_t ol1e;
 
-    ASSERT(!shadow_mode_refcounts(d));
-
     gmfn = addr >> PAGE_SHIFT;
     mfn = gmfn_to_mfn(d, gmfn);
 
@@ -2504,7 +2491,9 @@ static int destroy_grant_pte_mapping(
     }
 
     /* Delete pagetable entry. */
-    if ( unlikely(!update_l1e((l1_pgentry_t *)va, ol1e, l1e_empty())) )
+    if ( unlikely(!update_l1e(
+                      (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn, 
+                      d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
     {
         MEM_LOG("Cannot delete PTE entry at %p", va);
         put_page_type(page);
@@ -2512,14 +2501,6 @@ static int destroy_grant_pte_mapping(
         goto failed;
     }
 
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        struct domain_mmap_cache sh_mapcache;
-        domain_mmap_cache_init(&sh_mapcache);
-        shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
-        domain_mmap_cache_destroy(&sh_mapcache);
-    }
-
     put_page_type(page);
 
  failed:
@@ -2536,31 +2517,22 @@ static int create_grant_va_mapping(
     struct domain *d = v->domain;
     
     ASSERT(spin_is_locked(&d->big_lock));
-    ASSERT(!shadow_mode_refcounts(d));
-
-    /*
-     * This is actually overkill - we don't need to sync the L1 itself,
-     * just everything involved in getting to this L1 (i.e. we need
-     * linear_pg_table[l1_linear_offset(va)] to be in sync)...
-     */
-    __shadow_sync_va(v, va);
 
     pl1e = &linear_pg_table[l1_linear_offset(va)];
 
     if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
-         !update_l1e(pl1e, ol1e, _nl1e) )
+         !update_l1e(pl1e, ol1e, _nl1e, 
+                    l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) )
         return GNTST_general_error;
 
-    put_page_from_l1e(ol1e, d);
-
-    if ( unlikely(shadow_mode_enabled(d)) )
-        shadow_do_update_va_mapping(va, _nl1e, v);
+    if ( !shadow2_mode_refcounts(d) )
+        put_page_from_l1e(ol1e, d);
 
     return GNTST_okay;
 }
 
 static int destroy_grant_va_mapping(
-    unsigned long addr, unsigned long frame)
+    unsigned long addr, unsigned long frame, struct domain *d)
 {
     l1_pgentry_t *pl1e, ol1e;
     
@@ -2584,12 +2556,14 @@ static int destroy_grant_va_mapping(
     }
 
     /* Delete pagetable entry. */
-    if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty())) )
+    if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(), 
+                      l2e_get_pfn(__linear_l2_table[l2_linear_offset(addr)]),
+                      d->vcpu[0] /* Change for per-vcpu shadows */)) )
     {
         MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
         return GNTST_general_error;
     }
-    
+
     return 0;
 }
 
@@ -2597,7 +2571,7 @@ int create_grant_host_mapping(
     unsigned long addr, unsigned long frame, unsigned int flags)
 {
     l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
-        
+
     if ( (flags & GNTMAP_application_map) )
         l1e_add_flags(pte,_PAGE_USER);
     if ( !(flags & GNTMAP_readonly) )
@@ -2613,7 +2587,7 @@ int destroy_grant_host_mapping(
 {
     if ( flags & GNTMAP_contains_pte )
         return destroy_grant_pte_mapping(addr, frame, current->domain);
-    return destroy_grant_va_mapping(addr, frame);
+    return destroy_grant_va_mapping(addr, frame, current->domain);
 }
 
 int steal_page(
@@ -2675,46 +2649,44 @@ int do_update_va_mapping(unsigned long va, u64 val64,
 
     perfc_incrc(calls_to_update_va);
 
-    if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
+    if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) )
         return -EINVAL;
 
-    LOCK_BIGLOCK(d);
-
-    if ( unlikely(shadow_mode_enabled(d)) )
-        check_pagetable(v, "pre-va"); /* debug */
+    if ( unlikely(shadow2_mode_refcounts(d)) )
+    {
+        DPRINTK("Grant op on a shadow-refcounted domain\n");
+        return -EINVAL; 
+    }
 
-    if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
-                                val)) )
-        rc = -EINVAL;
+    LOCK_BIGLOCK(d);
 
-    if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
+    if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) )
     {
         if ( unlikely(this_cpu(percpu_mm_info).foreign &&
-                      (shadow_mode_translate(d) ||
-                       shadow_mode_translate(
+                      (shadow2_mode_translate(d) ||
+                       shadow2_mode_translate(
                            this_cpu(percpu_mm_info).foreign))) )
         {
             /*
              * The foreign domain's pfn's are in a different namespace. There's
-             * not enough information in just a gpte to figure out how to
+             * not enough information in just a gpte to figure out how to   
              * (re-)shadow this entry.
              */
             domain_crash(d);
         }
-    
-        rc = shadow_do_update_va_mapping(va, val, v);
-
-        check_pagetable(v, "post-va"); /* debug */
     }
 
+    if ( unlikely(!mod_l1_entry(
+                      &linear_pg_table[l1_linear_offset(va)], val,
+                      l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]))) )
+        rc = -EINVAL;
+    
     switch ( flags & UVMF_FLUSHTYPE_MASK )
     {
     case UVMF_TLB_FLUSH:
         switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
         {
         case UVMF_LOCAL:
-            if ( unlikely(shadow_mode_enabled(d)) )
-                shadow_sync_all(d);
             local_flush_tlb();
             break;
         case UVMF_ALL:
@@ -2733,9 +2705,9 @@ int do_update_va_mapping(unsigned long va, u64 val64,
         switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
         {
         case UVMF_LOCAL:
-            if ( unlikely(shadow_mode_enabled(d)) )
-                shadow_invlpg(current, va);
-            local_flush_tlb_one(va);
+            if ( !shadow2_mode_enabled(d) 
+                 || (shadow2_invlpg(current, va) != 0) ) 
+                local_flush_tlb_one(va);
             break;
         case UVMF_ALL:
             flush_tlb_one_mask(d->domain_dirty_cpumask, va);
@@ -2808,8 +2780,6 @@ long set_gdt(struct vcpu *v,
     if ( entries > FIRST_RESERVED_GDT_ENTRY )
         return -EINVAL;
 
-    shadow_sync_all(d);
-
     /* Check the pages in the new GDT. */
     for ( i = 0; i < nr_pages; i++ ) {
         mfn = frames[i] = gmfn_to_mfn(d, frames[i]);
@@ -2912,24 +2882,13 @@ long do_update_descriptor(u64 pa, u64 desc)
         break;
     }
 
-    if ( shadow_mode_enabled(dom) )
-    {
-        shadow_lock(dom);
-
-        __mark_dirty(dom, mfn);
-
-        if ( page_is_page_table(page) && !page_out_of_sync(page) )
-            shadow_mark_mfn_out_of_sync(current, gmfn, mfn);
-    }
+    mark_dirty(dom, mfn);
 
     /* All is good so make the update. */
     gdt_pent = map_domain_page(mfn);
     memcpy(&gdt_pent[offset], &d, 8);
     unmap_domain_page(gdt_pent);
 
-    if ( shadow_mode_enabled(dom) )
-        shadow_unlock(dom);
-
     put_page_type(page);
 
     ret = 0; /* success */
@@ -2981,8 +2940,8 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
         default:
             break;
         }
-        
-        if ( !shadow_mode_translate(d) || (mfn == 0) )
+
+        if ( !shadow2_mode_translate(d) || (mfn == 0) )
         {
             put_domain(d);
             return -EINVAL;
@@ -3011,7 +2970,7 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
         guest_physmap_add_page(d, xatp.gpfn, mfn);
 
         UNLOCK_BIGLOCK(d);
-
+        
         put_domain(d);
 
         break;
@@ -3136,7 +3095,8 @@ static int ptwr_emulated_update(
     unsigned long pfn;
     struct page_info *page;
     l1_pgentry_t pte, ol1e, nl1e, *pl1e;
-    struct domain *d = current->domain;
+    struct vcpu *v = current;
+    struct domain *d = v->domain;
 
     /* Aligned access only, thank you. */
     if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
@@ -3196,25 +3156,36 @@ static int ptwr_emulated_update(
         return X86EMUL_UNHANDLEABLE;
     }
 
+
     /* Checked successfully: do the update (write or cmpxchg). */
     pl1e = map_domain_page(page_to_mfn(page));
     pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
     if ( do_cmpxchg )
     {
+        if ( shadow2_mode_enabled(d) )
+            shadow2_lock(d);
         ol1e = l1e_from_intpte(old);
         if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
         {
+            if ( shadow2_mode_enabled(d) )
+                shadow2_unlock(d);
             unmap_domain_page(pl1e);
             put_page_from_l1e(nl1e, d);
             return X86EMUL_CMPXCHG_FAILED;
         }
+        if ( unlikely(shadow2_mode_enabled(v->domain)) )
+        {
+            shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
+            shadow2_unlock(v->domain);    
+        }
     }
     else
     {
         ol1e = *pl1e;
-        if ( !update_l1e(pl1e, ol1e, nl1e) )
+        if ( !update_l1e(pl1e, ol1e, nl1e, page_to_mfn(page), v) )
             BUG();
     }
+
     unmap_domain_page(pl1e);
 
     /* Finally, drop the old PTE. */
index ff0589082afb3bf791f46396b4a449dc5a554eee..01782320b3348e6fb2eb91296241e095727cb455 100644 (file)
@@ -532,8 +532,6 @@ void __init __start_xen(multiboot_info_t *mbi)
     if ( opt_watchdog ) 
         watchdog_enable();
 
-    shadow_mode_init();
-
     /* initialize access control security module */
     acm_init(&initrdidx, mbi, initial_images_start);
 
diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c
deleted file mode 100644 (file)
index 88e2ec8..0000000
+++ /dev/null
@@ -1,4150 +0,0 @@
-/******************************************************************************
- * arch/x86/shadow.c
- *
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-/*
- * Jun Nakajima <jun.nakajima@intel.com>
- * Chengyuan Li <chengyuan.li@intel.com>
- *
- * Extended to support 32-bit PAE and 64-bit guests.
- */
-
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/mm.h>
-#include <xen/domain_page.h>
-#include <asm/shadow.h>
-#include <asm/page.h>
-#include <xen/event.h>
-#include <xen/sched.h>
-#include <xen/trace.h>
-#include <asm/shadow_64.h>
-
-/* Use this to have the compiler remove unnecessary branches */
-#define SH_L1_HAS_NEXT_PAGE (GUEST_L1_PAGETABLE_ENTRIES - L1_PAGETABLE_ENTRIES)
-
-extern void free_shadow_pages(struct domain *d);
-
-#if 0 // this code has not been updated for 32pae & 64 bit modes
-#if SHADOW_DEBUG
-static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
-#endif
-#endif
-
-#if CONFIG_PAGING_LEVELS == 3
-static unsigned long shadow_l3_table(
-    struct vcpu *v, unsigned long gpfn, unsigned long gmfn);
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
-static unsigned long shadow_l4_table(
-    struct vcpu *v, unsigned long gpfn, unsigned long gmfn);
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 3
-static void shadow_map_into_current(struct vcpu *v,
-    unsigned long va, unsigned int from, unsigned int to);
-static inline void validate_bl2e_change( struct domain *d,
-    guest_root_pgentry_t *new_gle_p, pgentry_64_t *shadow_l3, int index);
-static void update_top_level_shadow(struct vcpu *v, unsigned long smfn);
-#endif
-
-/********
-
-There's a per-domain shadow table spin lock which works fine for SMP
-hosts. We don't have to worry about interrupts as no shadow operations
-happen in an interrupt context. It's probably not quite ready for SMP
-guest operation as we have to worry about synchonisation between gpte
-and spte updates. Its possible that this might only happen in a
-hypercall context, in which case we'll probably at have a per-domain
-hypercall lock anyhow (at least initially).
-
-********/
-
-static inline int
-shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
-               unsigned long new_type)
-{
-    struct page_info *page = mfn_to_page(gmfn);
-    int pinned = 0, okay = 1;
-
-    if ( page_out_of_sync(page) )
-    {
-        // Don't know how long ago this snapshot was taken.
-        // Can't trust it to be recent enough.
-        //
-        __shadow_sync_mfn(d, gmfn);
-    }
-
-    if ( !shadow_mode_refcounts(d) )
-        return 1;
-
-    if ( unlikely(page_is_page_table(page)) )
-        return 1;
-
-    FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type);
-
-    if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
-    {
-        FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx",
-                __func__, gpfn, gmfn);
-#if 1 || defined(LIVE_DANGEROUSLY)
-        set_bit(_PGC_page_table, &page->count_info);
-        return 1;
-#endif
-        return 0;
-    }
-
-    // To convert this page to use as a page table, the writable count
-    // should now be zero.  Test this by grabbing the page as an page table,
-    // and then immediately releasing.  This will also deal with any
-    // necessary TLB flushing issues for us.
-    //
-    // The cruft here about pinning doesn't really work right.  This
-    // needs rethinking/rewriting...  Need to gracefully deal with the
-    // TLB flushes required when promoting a writable page, and also deal
-    // with any outstanding (external) writable refs to this page (by
-    // refusing to promote it).  The pinning headache complicates this
-    // code -- it would all get much simpler if we stop using
-    // shadow_lock() and move the shadow code to BIGLOCK().
-    //
-    if ( unlikely(!get_page(page, d)) )
-        BUG(); // XXX -- needs more thought for a graceful failure
-    if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
-    {
-        pinned = 1;
-        put_page_and_type(page);
-    }
-    if ( get_page_type(page, PGT_base_page_table) )
-    {
-        set_bit(_PGC_page_table, &page->count_info);
-        put_page_type(page);
-    }
-    else
-    {
-        printk("shadow_promote: get_page_type failed "
-               "dom%d gpfn=%lx gmfn=%lx t=%08lx\n",
-               d->domain_id, gpfn, gmfn, new_type);
-        okay = 0;
-    }
-
-    // Now put the type back to writable...
-    if ( unlikely(!get_page_type(page, PGT_writable_page)) )
-        BUG(); // XXX -- needs more thought for a graceful failure
-    if ( unlikely(pinned) )
-    {
-        if ( unlikely(test_and_set_bit(_PGT_pinned,
-                                       &page->u.inuse.type_info)) )
-            BUG(); // hmm... someone pinned this again?
-    }
-    else
-        put_page_and_type(page);
-
-    return okay;
-}
-
-
-/*
- * Things in shadow mode that collect get_page() refs to the domain's
- * pages are:
- * - PGC_allocated takes a gen count, just like normal.
- * - A writable page can be pinned (paravirtualized guests may consider
- *   these pages to be L1s or L2s, and don't know the difference).
- *   Pinning a page takes a gen count (but, for domains in shadow mode,
- *   it *doesn't* take a type count)
- * - CR3 grabs a ref to whatever it points at, just like normal.
- * - Shadow mode grabs an initial gen count for itself, as a placehold
- *   for whatever references will exist.
- * - Shadow PTEs that point to a page take a gen count, just like regular
- *   PTEs.  However, they don't get a type count, as get_page_type() is
- *   hardwired to keep writable pages' counts at 1 for domains in shadow
- *   mode.
- * - Whenever we shadow a page, the entry in the shadow hash grabs a
- *   general ref to the page.
- * - Whenever a page goes out of sync, the out of sync entry grabs a
- *   general ref to the page.
- */
-/*
- * page_info fields for pages allocated as shadow pages:
- *
- * All 32 bits of count_info are a simple count of refs to this shadow
- * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
- * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
- * references.
- *
- * u.inuse._domain is left NULL, to prevent accidently allow some random
- * domain from gaining permissions to map this page.
- *
- * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
- * shadowed.
- * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
- * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
- * is currently exists because this is a shadow of a root page, and we
- * don't want to let those disappear just because no CR3 is currently pointing
- * at it.
- *
- * tlbflush_timestamp holds a min & max index of valid page table entries
- * within the shadow page.
- */
-static inline void
-shadow_page_info_init(struct page_info *page,
-                      unsigned long gmfn,
-                      u32 psh_type)
-{
-    ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
-    page->u.inuse.type_info = psh_type | gmfn;
-    page->count_info = 0;
-    page->tlbflush_timestamp = 0;
-}
-
-static inline unsigned long
-alloc_shadow_page(struct domain *d,
-                  unsigned long gpfn, unsigned long gmfn,
-                  u32 psh_type)
-{
-    struct page_info *page;
-    unsigned long smfn, real_gpfn;
-    int pin = 0;
-    void *l1, *lp;
-    u64 index = 0;
-
-    // Currently, we only keep pre-zero'ed pages around for use as L1's...
-    // This will change.  Soon.
-    //
-    if ( psh_type == PGT_l1_shadow )
-    {
-        if ( !list_empty(&d->arch.free_shadow_frames) )
-        {
-            struct list_head *entry = d->arch.free_shadow_frames.next;
-            page = list_entry(entry, struct page_info, list);
-            list_del(entry);
-            perfc_decr(free_l1_pages);
-        }
-        else
-        {
-            if ( SH_L1_HAS_NEXT_PAGE &&
-                 d->arch.ops->guest_paging_levels == PAGING_L2)
-            {
-#if CONFIG_PAGING_LEVELS >= 3
-                /* 
-                 * For 32-bit HVM guest, 2 shadow L1s are required to
-                 * simulate 1 guest L1 So need allocate 2 shadow L1
-                 * pages each time. 
-                 *
-                 * --> Need to avoidalloc_domheap_pages.
-                 */
-                page = alloc_domheap_pages(NULL, SL1_ORDER, 0);
-                if (!page)
-                    goto no_shadow_page;
-
-                l1 = map_domain_page(page_to_mfn(page));
-                memset(l1, 0, PAGE_SIZE);
-                unmap_domain_page(l1);
-
-                l1 = map_domain_page(page_to_mfn(page + 1));
-                memset(l1, 0, PAGE_SIZE);
-                unmap_domain_page(l1);
-
-                /* we'd like to initialize the second continuous page here
-                 * and leave the first page initialization later */
-
-                shadow_page_info_init(page+1, gmfn, psh_type);
-#else
-                page = alloc_domheap_page(NULL);
-                if (!page)
-                    goto no_shadow_page;
-
-                l1 = map_domain_page(page_to_mfn(page));
-                memset(l1, 0, PAGE_SIZE);
-                unmap_domain_page(l1);
-#endif
-            }
-            else
-            {
-                page = alloc_domheap_page(NULL);
-                if (!page)
-                    goto no_shadow_page;
-
-                l1 = map_domain_page(page_to_mfn(page));
-                memset(l1, 0, PAGE_SIZE);
-                unmap_domain_page(l1);
-            }
-        }
-    }
-    else {
-#if CONFIG_PAGING_LEVELS == 2
-        page = alloc_domheap_page(NULL);
-#elif CONFIG_PAGING_LEVELS >= 3
-        if ( d->arch.ops->guest_paging_levels == PAGING_L2 &&
-             psh_type == PGT_l4_shadow )      /* allocated for PAE PDP page */
-            page = alloc_domheap_pages(NULL, 0, MEMF_dma);
-        else if ( d->arch.ops->guest_paging_levels == PAGING_L3 &&
-                  (psh_type == PGT_l3_shadow || psh_type == PGT_l4_shadow) )
-            page = alloc_domheap_pages(NULL, 0, MEMF_dma); /* allocated for PAE PDP page */
-        else
-            page = alloc_domheap_page(NULL);
-#endif
-        if (!page)
-            goto no_shadow_page;
-
-        lp = map_domain_page(page_to_mfn(page));
-        memset(lp, 0, PAGE_SIZE);
-        unmap_domain_page(lp);
-    }
-
-    smfn = page_to_mfn(page);
-
-    shadow_page_info_init(page, gmfn, psh_type);
-
-    switch ( psh_type )
-    {
-    case PGT_l1_shadow:
-        if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
-            goto fail;
-        perfc_incr(shadow_l1_pages);
-        d->arch.shadow_page_count++;
-        break;
-
-    case PGT_l2_shadow:
-        if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
-            goto fail;
-        perfc_incr(shadow_l2_pages);
-        d->arch.shadow_page_count++;
-        if ( PGT_l2_page_table == PGT_root_page_table )
-            pin = 1;
-
-        break;
-
-    case PGT_l3_shadow:
-        if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
-            goto fail;
-        perfc_incr(shadow_l3_pages);
-        d->arch.shadow_page_count++;
-        if ( PGT_l3_page_table == PGT_root_page_table )
-            pin = 1;
-        break;
-
-    case PGT_l4_shadow:
-        real_gpfn = gpfn & PGT_mfn_mask;
-        if ( !shadow_promote(d, real_gpfn, gmfn, psh_type) )
-            goto fail;
-        perfc_incr(shadow_l4_pages);
-        d->arch.shadow_page_count++;
-        if ( PGT_l4_page_table == PGT_root_page_table )
-            pin = 1;
-#if CONFIG_PAGING_LEVELS == 3 & defined (GUEST_PGENTRY_32)
-        /*
-         * We use PGT_l4_shadow for 2-level paging guests on PAE
-         */
-        if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
-            pin = 1;
-#endif
-
-#if CONFIG_PAGING_LEVELS == 3 & defined ( GUEST_32PAE )
-        /*
-         * We use PGT_l4_shadow for 2-level paging guests on PAE
-         */
-        if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
-            pin = 1;
-#endif
-        if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
-            index = get_cr3_idxval(current);
-        break;
-
-#if CONFIG_PAGING_LEVELS >= 3
-    case PGT_fl1_shadow:
-        perfc_incr(shadow_l1_pages);
-        d->arch.shadow_page_count++;
-        break;
-#else
-
-    case PGT_hl2_shadow:
-        // Treat an hl2 as an L1 for purposes of promotion.
-        // For external mode domains, treat them as an L2 for purposes of
-        // pinning.
-        //
-        if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
-            goto fail;
-        perfc_incr(hl2_table_pages);
-        d->arch.hl2_page_count++;
-        if ( shadow_mode_external(d) &&
-             (PGT_l2_page_table == PGT_root_page_table) )
-            pin = 1;
-
-        break;
-#endif
-    case PGT_snapshot:
-        perfc_incr(snapshot_pages);
-        d->arch.snapshot_page_count++;
-        break;
-
-    default:
-        printk("Alloc shadow weird page type type=%08x\n", psh_type);
-        BUG();
-        break;
-    }
-
-    // Don't add a new shadow of something that already has a snapshot.
-    //
-    ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
-
-    set_shadow_status(d, gpfn, gmfn, smfn, psh_type, index);
-
-    if ( pin )
-        shadow_pin(smfn);
-
-    return smfn;
-
-fail:
-    FSH_LOG("promotion of pfn=%lx mfn=%lx failed!  external gnttab refs?",
-            gpfn, gmfn);
-    if (psh_type == PGT_l1_shadow)
-    {
-        if (d->arch.ops->guest_paging_levels == PAGING_L2)
-        {
-#if CONFIG_PAGING_LEVELS >=3
-            free_domheap_pages(page, SL1_ORDER);
-#else
-            free_domheap_page(page);
-#endif
-        }
-        else
-            free_domheap_page(page);
-    }
-    else
-        free_domheap_page(page);
-
-    return 0;
-
-no_shadow_page:
-    ASSERT(page == NULL);
-    printk("Couldn't alloc shadow page! dom%d count=%d\n",
-           d->domain_id, d->arch.shadow_page_count);
-    printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
-           perfc_value(shadow_l1_pages),
-           perfc_value(shadow_l2_pages),
-           perfc_value(hl2_table_pages),
-           perfc_value(snapshot_pages));
-    /* XXX FIXME: try a shadow flush to free up some memory. */
-    domain_crash_synchronous();
-
-    return 0;
-}
-
-#if CONFIG_PAGING_LEVELS == 2
-static unsigned long
-shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
-                unsigned long smfn)
-{
-    unsigned long hl2mfn;
-    l1_pgentry_t *hl2;
-    int limit;
-
-    ASSERT(PGT_base_page_table == PGT_l2_page_table);
-
-    if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
-    {
-        printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n",
-               gpfn, gmfn);
-        BUG(); /* XXX Deal gracefully with failure. */
-    }
-
-    SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx",
-             gpfn, gmfn, smfn, hl2mfn);
-    perfc_incrc(shadow_hl2_table_count);
-
-    hl2 = map_domain_page(hl2mfn);
-
-    if ( shadow_mode_external(d) )
-        limit = L2_PAGETABLE_ENTRIES;
-    else
-        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-
-    memset(hl2, 0, limit * sizeof(l1_pgentry_t));
-
-    if ( !shadow_mode_external(d) )
-    {
-        memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
-               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-
-        // Setup easy access to the GL2, SL2, and HL2 frames.
-        //
-        hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
-            l1e_from_pfn(gmfn, __PAGE_HYPERVISOR);
-        hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
-            l1e_from_pfn(smfn, __PAGE_HYPERVISOR);
-        hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
-            l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
-    }
-
-    unmap_domain_page(hl2);
-
-    return hl2mfn;
-}
-
-/*
- * This could take and use a snapshot, and validate the entire page at
- * once, or it could continue to fault in entries one at a time...
- * Might be worth investigating...
- */
-static unsigned long shadow_l2_table(
-    struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
-{
-    unsigned long smfn;
-    l2_pgentry_t *spl2e;
-    struct domain *d = v->domain;
-    int i;
-
-    SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
-
-    perfc_incrc(shadow_l2_table_count);
-
-    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
-    {
-        printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
-               gpfn, gmfn);
-        BUG(); /* XXX Deal gracefully with failure. */
-    }
-
-    spl2e = (l2_pgentry_t *)map_domain_page(smfn);
-
-    /* Install hypervisor and 2x linear p.t. mapings. */
-    if ( (PGT_base_page_table == PGT_l2_page_table) &&
-         !shadow_mode_external(d) )
-    {
-        /*
-         * We could proactively fill in PDEs for pages that are already
-         * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
-         * (restriction required for coherence of the accessed bit). However,
-         * we tried it and it didn't help performance. This is simpler.
-         */
-        memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
-
-        /* Install hypervisor and 2x linear p.t. mapings. */
-        memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-               &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-
-        spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
-            l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
-
-        for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
-            spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
-                l2e_from_page(virt_to_page(page_get_owner(mfn_to_page(gmfn))->
-                                           arch.mm_perdomain_pt) + i,
-                              __PAGE_HYPERVISOR);
-
-        if ( shadow_mode_translate(d) ) // NB: not external
-        {
-            unsigned long hl2mfn;
-
-            spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
-                l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
-                                __PAGE_HYPERVISOR);
-
-            if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
-                hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
-
-            // shadow_mode_translate (but not external) sl2 tables hold a
-            // ref to their hl2.
-            //
-            if ( !get_shadow_ref(hl2mfn) )
-                BUG();
-
-            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
-                l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
-        }
-        else
-            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
-                l2e_from_pfn(gmfn, __PAGE_HYPERVISOR);
-    }
-    else
-    {
-        memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));
-    }
-
-    unmap_domain_page(spl2e);
-
-    SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn);
-    return smfn;
-}
-#endif /* CONFIG_PAGING_LEVELS == 2 */
-
-static void shadow_map_l1_into_current_l2(unsigned long va)
-{
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    l1_pgentry_t *spl1e, *spl1e_next = 0;
-    l2_pgentry_t sl2e;
-    guest_l1_pgentry_t *gpl1e;
-    guest_l2_pgentry_t gl2e = {0};
-    unsigned long gl1pfn, gl1mfn, sl1mfn;
-    int i, init_table = 0;
-
-    __guest_get_l2e(v, va, &gl2e);
-    ASSERT(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT);
-    gl1pfn = l2e_get_pfn(gl2e);
-
-    if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
-    {
-        /* This L1 is NOT already shadowed so we need to shadow it. */
-        SH_VVLOG("4a: l1 not shadowed");
-
-        gl1mfn = gmfn_to_mfn(d, gl1pfn);
-        if ( unlikely(!VALID_MFN(gl1mfn)) )
-        {
-            // Attempt to use an invalid pfn as an L1 page.
-            // XXX this needs to be more graceful!
-            BUG();
-        }
-
-        if ( unlikely(!(sl1mfn =
-                        alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
-        {
-            printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n",
-                   gl1pfn, gl1mfn);
-            BUG(); /* XXX Need to deal gracefully with failure. */
-        }
-
-        perfc_incrc(shadow_l1_table_count);
-        init_table = 1;
-    }
-    else
-    {
-        /* This L1 is shadowed already, but the L2 entry is missing. */
-        SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn);
-    }
-
-#ifndef NDEBUG
-    {
-        l2_pgentry_t old_sl2e;
-        __shadow_get_l2e(v, va, &old_sl2e);
-        ASSERT(!(l2e_get_flags(old_sl2e) & _PAGE_PRESENT));
-    }
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 3
-    if ( SH_L1_HAS_NEXT_PAGE && 
-         d->arch.ops->guest_paging_levels == PAGING_L2 )
-    {
-        /* for 32-bit HVM guest on 64-bit or PAE host,
-         * need update two L2 entries each time
-         */
-        if ( !get_shadow_ref(sl1mfn))
-            BUG();
-        l2pde_general(d, &gl2e, &sl2e, sl1mfn);
-        __guest_set_l2e(v, va, &gl2e);
-        __shadow_set_l2e(v, va & ~((1<<L2_PAGETABLE_SHIFT_32) - 1), &sl2e);
-        if ( !get_shadow_ref(sl1mfn+1))
-            BUG();
-        sl2e = l2e_empty();
-        l2pde_general(d, &gl2e, &sl2e, sl1mfn+1);
-        __shadow_set_l2e(v,((va & ~((1<<L2_PAGETABLE_SHIFT_32) - 1)) + (1 << L2_PAGETABLE_SHIFT)) , &sl2e);
-    } else
-#endif
-    {
-        if ( !get_shadow_ref(sl1mfn) )
-            BUG();
-        l2pde_general(d, &gl2e, &sl2e, sl1mfn);
-        __guest_set_l2e(v, va, &gl2e);
-        __shadow_set_l2e(v, va , &sl2e);
-    }
-
-    if ( init_table )
-    {
-        l1_pgentry_t sl1e;
-        int index = guest_l1_table_offset(va);
-        int min = 1, max = 0;
-
-        unsigned long tmp_gmfn;
-        l2_pgentry_t tmp_sl2e = {0};
-        guest_l2_pgentry_t tmp_gl2e = {0};
-
-        __guest_get_l2e(v, va, &tmp_gl2e);
-        tmp_gmfn = gmfn_to_mfn(d, l2e_get_pfn(tmp_gl2e));
-        gpl1e = (guest_l1_pgentry_t *) map_domain_page(tmp_gmfn);
-
-        /* If the PGT_l1_shadow has two contiguous pages */
-#if CONFIG_PAGING_LEVELS >= 3
-        if ( SH_L1_HAS_NEXT_PAGE &&
-             d->arch.ops->guest_paging_levels == PAGING_L2 )
-            __shadow_get_l2e(v,  va & ~((1UL << L2_PAGETABLE_SHIFT_32) - 1), &tmp_sl2e);
-        else
-#endif
-        __shadow_get_l2e(v, va, &tmp_sl2e);
-
-        spl1e = (l1_pgentry_t *) map_domain_page(l2e_get_pfn(tmp_sl2e));
-
-        if ( SH_L1_HAS_NEXT_PAGE )
-            spl1e_next = (l1_pgentry_t *) map_domain_page(
-                (l2e_get_pfn(tmp_sl2e) + 1UL));
-
-        for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ )
-        {
-            l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
-            if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
-                 unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
-                sl1e = l1e_empty();
-            if ( l1e_get_flags(sl1e) == 0 )
-            {
-                // First copy entries from 0 until first invalid.
-                // Then copy entries from index until first invalid.
-                //
-                if ( i < index ) {
-                    i = index - 1;
-                    continue;
-                }
-                break;
-            }
-
-            if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES )
-                spl1e_next[i - L1_PAGETABLE_ENTRIES] = sl1e;
-            else 
-                spl1e[i] = sl1e;
-
-            if ( unlikely(i < min) )
-                min = i;
-            if ( likely(i > max) )
-                max = i;
-            set_guest_back_ptr(d, sl1e, sl1mfn, i);
-        }
-
-        mfn_to_page(sl1mfn)->tlbflush_timestamp =
-            SHADOW_ENCODE_MIN_MAX(min, max);
-
-        unmap_domain_page(gpl1e);
-        unmap_domain_page(spl1e);
-
-        if ( SH_L1_HAS_NEXT_PAGE )
-            unmap_domain_page(spl1e_next);
-    }
-}
-
-#if CONFIG_PAGING_LEVELS == 2
-static void
-shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
-{
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    l2_pgentry_t sl2e = {0};
-
-    __shadow_get_l2e(v, va, &sl2e);
-    if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
-    {
-        /*
-         * Either the L1 is not shadowed, or the shadow isn't linked into
-         * the current shadow L2.
-         */
-        if ( create_l1_shadow )
-        {
-            perfc_incrc(shadow_set_l1e_force_map);
-            shadow_map_l1_into_current_l2(va);
-        }
-        else /* check to see if it exists; if so, link it in */
-        {
-            l2_pgentry_t gpde = {0};
-            unsigned long gl1pfn;
-            unsigned long sl1mfn;
-
-            __guest_get_l2e(v, va, &gpde);
-
-            if ( l2e_get_flags(gpde) & _PAGE_PRESENT )
-            {
-                gl1pfn = l2e_get_pfn(gpde);
-                sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
-            }
-            else
-            {
-                // no shadow exists, so there's nothing to do.
-                perfc_incrc(shadow_set_l1e_fail);
-                return;
-            }
-
-            if ( sl1mfn )
-            {
-                perfc_incrc(shadow_set_l1e_unlinked);
-                if ( !get_shadow_ref(sl1mfn) )
-                    BUG();
-                l2pde_general(d, (guest_l2_pgentry_t *)&gpde, &sl2e, sl1mfn);
-                __guest_set_l2e(v, va, &gpde);
-                __shadow_set_l2e(v, va, &sl2e);
-            }
-            else
-            {
-                // no shadow exists, so there's nothing to do.
-                perfc_incrc(shadow_set_l1e_fail);
-                return;
-            }
-        }
-    }
-
-    __shadow_get_l2e(v, va, &sl2e);
-
-    if ( shadow_mode_refcounts(d) )
-    {
-        l1_pgentry_t old_spte;
-        __shadow_get_l1e(v, va, &old_spte);
-
-        // only do the ref counting if something important changed.
-        //
-        if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
-        {
-            if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
-                 !shadow_get_page_from_l1e(new_spte, d) )
-                new_spte = l1e_empty();
-            if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
-                shadow_put_page_from_l1e(old_spte, d);
-        }
-    }
-
-    set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va));
-    __shadow_set_l1e(v, va, &new_spte);
-    shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va));
-}
-
-static void shadow_invlpg_32(struct vcpu *v, unsigned long va)
-{
-    struct domain *d = v->domain;
-    l1_pgentry_t gpte, spte;
-
-    ASSERT(shadow_mode_enabled(d));
-
-    shadow_lock(d);
-
-    __shadow_sync_va(v, va);
-
-    // XXX mafetter: will need to think about 4MB pages...
-
-    // It's not strictly necessary to update the shadow here,
-    // but it might save a fault later.
-    //
-    /*if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
-                         sizeof(gpte))) {*/
-    if (unlikely(!__guest_get_l1e(v, va, &gpte))) {
-        perfc_incrc(shadow_invlpg_faults);
-        shadow_unlock(d);
-        return;
-    }
-    l1pte_propagate_from_guest(d, gpte, &spte);
-    shadow_set_l1e(va, spte, 1);
-
-    shadow_unlock(d);
-}
-#endif /* CONFIG_PAGING_LEVELS == 2 */
-
-#if CONFIG_PAGING_LEVELS >= 3
-static void shadow_set_l1e_64(
-    unsigned long va, pgentry_64_t *sl1e_p,
-    int create_l1_shadow)
-{
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    pgentry_64_t sle = { 0 };
-    pgentry_64_t sle_up = {0};
-    l1_pgentry_t old_spte;
-    l1_pgentry_t sl1e = *(l1_pgentry_t *)sl1e_p;
-    int i;
-    unsigned long orig_va = 0;
-
-    if ( d->arch.ops->guest_paging_levels == PAGING_L2 ) 
-    {
-        /* This is for 32-bit VMX guest on 64-bit host */
-        orig_va = va;
-        va = va & (~((1<<L2_PAGETABLE_SHIFT_32)-1));
-    }
-
-    for ( i = PAGING_L4; i >= PAGING_L2; i-- )
-    {
-        if ( !__rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i) )
-        {
-            sl1e = l1e_empty();
-            goto out;
-        }
-        if ( !(entry_get_flags(sle) & _PAGE_PRESENT) )
-        {
-            if ( create_l1_shadow )
-            {
-                perfc_incrc(shadow_set_l3e_force_map);
-                shadow_map_into_current(v, va, i-1, i);
-                __rw_entry(v, va, &sle, SHADOW_ENTRY | GET_ENTRY | i);
-            }
-        }
-        if ( d->arch.ops->guest_paging_levels == PAGING_L3 ) 
-        {
-            if ( i < PAGING_L3 )
-                shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i));
-        }
-        else 
-        {
-            if ( i < PAGING_L4 )
-                shadow_update_min_max(entry_get_pfn(sle_up), table_offset_64(va, i));
-        }
-
-        sle_up = sle;
-    }
-
-    if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
-    {
-        va = orig_va;
-    }
-
-    if ( shadow_mode_refcounts(d) )
-    {
-        __shadow_get_l1e(v, va, &old_spte);
-        if ( l1e_has_changed(old_spte, sl1e, _PAGE_RW | _PAGE_PRESENT) )
-        {
-            if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
-                 !shadow_get_page_from_l1e(sl1e, d) )
-                sl1e = l1e_empty();
-            if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
-                put_page_from_l1e(old_spte, d);
-        }
-    }
-
-out:
-    __shadow_set_l1e(v, va, &sl1e);
-
-    shadow_update_min_max(entry_get_pfn(sle_up), guest_l1_table_offset(va));
-}
-#endif /* CONFIG_PAGING_LEVELS >= 3 */
-
-static struct out_of_sync_entry *
-shadow_alloc_oos_entry(struct domain *d)
-{
-    struct out_of_sync_entry *f, *extra;
-    unsigned size, i;
-
-    if ( unlikely(d->arch.out_of_sync_free == NULL) )
-    {
-        FSH_LOG("Allocate more fullshadow tuple blocks.");
-
-        size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
-        extra = xmalloc_bytes(size);
-
-        /* XXX Should be more graceful here. */
-        if ( extra == NULL )
-            BUG();
-
-        memset(extra, 0, size);
-
-        /* Record the allocation block so it can be correctly freed later. */
-        d->arch.out_of_sync_extras_count++;
-        *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) =
-            d->arch.out_of_sync_extras;
-        d->arch.out_of_sync_extras = &extra[0];
-
-        /* Thread a free chain through the newly-allocated nodes. */
-        for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
-            extra[i].next = &extra[i+1];
-        extra[i].next = NULL;
-
-        /* Add the new nodes to the free list. */
-        d->arch.out_of_sync_free = &extra[0];
-    }
-
-    /* Allocate a new node from the quicklist. */
-    f = d->arch.out_of_sync_free;
-    d->arch.out_of_sync_free = f->next;
-
-    return f;
-}
-
-static inline unsigned long
-shadow_make_snapshot(
-    struct domain *d, unsigned long gpfn, unsigned long gmfn)
-{
-    unsigned long smfn, sl1mfn = 0;
-    void *original, *snapshot;
-    u32 min_max = 0;
-    int min, max, length;
-
-    if ( test_and_set_bit(_PGC_out_of_sync, &mfn_to_page(gmfn)->count_info) )
-    {
-        ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
-        return SHADOW_SNAPSHOT_ELSEWHERE;
-    }
-
-    perfc_incrc(shadow_make_snapshot);
-
-    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
-    {
-        printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n"
-               "Dom%d snapshot_count_count=%d\n",
-               gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count);
-        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
-    }
-
-    if ( !get_shadow_ref(smfn) )
-        BUG();
-
-    if ( shadow_mode_refcounts(d) &&
-         (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
-        min_max = mfn_to_page(sl1mfn)->tlbflush_timestamp;
-    mfn_to_page(smfn)->tlbflush_timestamp = min_max;
-
-    min = SHADOW_MIN(min_max);
-    max = SHADOW_MAX(min_max);
-    length = max - min + 1;
-    perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
-
-    min *= sizeof(guest_l1_pgentry_t);
-    length *= sizeof(guest_l1_pgentry_t);
-
-    original = map_domain_page(gmfn);
-    snapshot = map_domain_page(smfn);
-    memcpy(snapshot + min, original + min, length);
-    unmap_domain_page(original);
-    unmap_domain_page(snapshot);
-
-    return smfn;
-}
-
-static struct out_of_sync_entry *
-__mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
-                             unsigned long mfn)
-{
-    struct domain *d = v->domain;
-    struct page_info *page = mfn_to_page(mfn);
-    struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(mfn_valid(mfn));
-
-#ifndef NDEBUG
-    {
-        u32 type = page->u.inuse.type_info & PGT_type_mask;
-        if ( shadow_mode_refcounts(d) )
-        {
-            ASSERT(type == PGT_writable_page);
-        }
-        else
-        {
-            ASSERT(type && (type < PGT_l4_page_table));
-        }
-    }
-#endif
-
-    FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08x", __func__,
-            gpfn, mfn, page->count_info, page->u.inuse.type_info);
-
-    // XXX this will require some more thought...  Cross-domain sharing and
-    //     modification of page tables?  Hmm...
-    //
-    if ( d != page_get_owner(page) )
-        BUG();
-
-    perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
-
-    entry->v = v;
-    entry->gpfn = gpfn;
-    entry->gmfn = mfn;
-    entry->writable_pl1e = -1;
-
-#if 0 // this code has not been updated for 32pae & 64 bit modes
-#if SHADOW_DEBUG
-    mark_shadows_as_reflecting_snapshot(d, gpfn);
-#endif
-#endif
-
-    // increment guest's ref count to represent the entry in the
-    // full shadow out-of-sync list.
-    //
-    get_page(page, d);
-
-    return entry;
-}
-
-static struct out_of_sync_entry *
-mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
-                             unsigned long mfn)
-{
-    struct out_of_sync_entry *entry =
-        __mark_mfn_out_of_sync(v, gpfn, mfn);
-    struct domain *d = v->domain;
-
-    entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
-    // Add to the out-of-sync list
-    //
-    entry->next = d->arch.out_of_sync;
-    d->arch.out_of_sync = entry;
-
-    return entry;
-
-}
-
-static void shadow_mark_va_out_of_sync(
-    struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va)
-{
-    struct out_of_sync_entry *entry =
-        __mark_mfn_out_of_sync(v, gpfn, mfn);
-    l2_pgentry_t sl2e;
-    struct domain *d = v->domain;
-
-#if CONFIG_PAGING_LEVELS >= 3
-    {
-        l4_pgentry_t sl4e;
-        l3_pgentry_t sl3e;
-
-        __shadow_get_l4e(v, va, &sl4e);
-        if ( !(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
-            shadow_map_into_current(v, va, PAGING_L3, PAGING_L4);
-        }
-
-        if (!__shadow_get_l3e(v, va, &sl3e)) {
-            BUG();
-        }
-
-        if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
-            shadow_map_into_current(v, va, PAGING_L2, PAGING_L3);
-        }
-    }
-#endif
-
-    // We need the address of shadow PTE that maps @va.
-    // It might not exist yet.  Make sure it's there.
-    //
-    __shadow_get_l2e(v, va, &sl2e);
-    if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
-    {
-        // either this L1 isn't shadowed yet, or the shadow isn't linked into
-        // the current L2.
-        shadow_map_l1_into_current_l2(va);
-        __shadow_get_l2e(v, va, &sl2e);
-    }
-    ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
-
-    entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
-    // NB: this is stored as a machine address.
-    entry->writable_pl1e =
-        l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
-    ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
-    entry->va = va;
-
-    // Increment shadow's page count to represent the reference
-    // inherent in entry->writable_pl1e
-    //
-    if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
-        BUG();
-
-    // Add to the out-of-sync list
-    //
-    entry->next = d->arch.out_of_sync;
-    d->arch.out_of_sync = entry;
-
-    FSH_LOG("%s(va=%lx -> writable_pl1e=%lx)",
-            __func__, va, entry->writable_pl1e);
-}
-
-/*
- * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
- * Returns 0 otherwise.
- */
-static int snapshot_entry_matches(
-    struct domain *d, guest_l1_pgentry_t *guest_pt,
-    unsigned long gpfn, unsigned index)
-{
-    unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot);
-    guest_l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ...
-    int entries_match;
-
-    perfc_incrc(snapshot_entry_matches_calls);
-
-    if ( !smfn )
-        return 0;
-
-    snapshot = map_domain_page(smfn);
-
-    if (__copy_from_user(&gpte, &guest_pt[index],
-                         sizeof(gpte)))
-    {
-        unmap_domain_page(snapshot);
-        return 0;
-    }
-
-    // This could probably be smarter, but this is sufficent for
-    // our current needs.
-    //
-    entries_match = !guest_l1e_has_changed(gpte, snapshot[index],
-                                     PAGE_FLAG_MASK);
-
-    unmap_domain_page(snapshot);
-
-#ifdef PERF_COUNTERS
-    if ( entries_match )
-        perfc_incrc(snapshot_entry_matches_true);
-#endif
-
-    return entries_match;
-}
-
-/*
- * Returns 1 if va's shadow mapping is out-of-sync.
- * Returns 0 otherwise.
- */
-static int is_out_of_sync(struct vcpu *v, unsigned long va) /* __shadow_out_of_sync */
-{
-    struct domain *d = v->domain;
-#if CONFIG_PAGING_LEVELS == 4
-    unsigned long l2mfn = ((v->arch.flags & TF_kernel_mode)?
-                          pagetable_get_pfn(v->arch.guest_table) :
-                          pagetable_get_pfn(v->arch.guest_table_user));
-#else
-    unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table);
-#endif
-    unsigned long l2pfn = mfn_to_gmfn(d, l2mfn);
-    guest_l2_pgentry_t l2e;
-    unsigned long l1pfn, l1mfn;
-    guest_l1_pgentry_t *guest_pt;
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(VALID_M2P(l2pfn));
-
-    perfc_incrc(shadow_out_of_sync_calls);
-
-#if CONFIG_PAGING_LEVELS >= 3
-
-#define unmap_and_return(x)                                         \
-    if ( guest_pt != (guest_l1_pgentry_t *) v->arch.guest_vtable )  \
-        unmap_domain_page(guest_pt);                                \
-    return (x);
-
-    if (d->arch.ops->guest_paging_levels >= PAGING_L3) 
-    { 
-        pgentry_64_t le;
-        unsigned long gmfn;
-        unsigned long gpfn;
-        int i;
-        unsigned int base_idx = 0;
-        base_idx = get_cr3_idxval(v);
-
-        gmfn = l2mfn;
-        gpfn = l2pfn;
-        guest_pt = (guest_l1_pgentry_t *)v->arch.guest_vtable;
-
-        for ( i = PAGING_L4; i >= PAGING_L3; i-- ) 
-        {
-            if (d->arch.ops->guest_paging_levels == PAGING_L3 
-                && i == PAGING_L4)
-                continue;       /* skip the top-level for 3-level */
-
-            if ( page_out_of_sync(mfn_to_page(gmfn)) &&
-                 !snapshot_entry_matches(
-                     d, guest_pt, gpfn, guest_table_offset_64(va, i, base_idx)) )
-            {
-                unmap_and_return (1);
-            }
-
-            le = entry_empty();
-            __rw_entry(v, va, &le, GUEST_ENTRY | GET_ENTRY | i);
-
-            if ( !(entry_get_flags(le) & _PAGE_PRESENT) )
-            {
-                unmap_and_return (0);
-            }
-            gpfn = entry_get_pfn(le);
-            gmfn = gmfn_to_mfn(d, gpfn);
-            if ( !VALID_MFN(gmfn) )
-            {
-                unmap_and_return (0);
-            }
-            if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable )
-                unmap_domain_page(guest_pt);
-            guest_pt = (guest_l1_pgentry_t *)map_domain_page(gmfn);
-        }
-
-        /* L2 */
-        if ( page_out_of_sync(mfn_to_page(gmfn)) &&
-             !snapshot_entry_matches(d, guest_pt, gpfn, l2_table_offset(va)) )
-        {
-            unmap_and_return (1);
-        }
-
-        if ( guest_pt != (guest_l1_pgentry_t *)v->arch.guest_vtable )
-            unmap_domain_page(guest_pt);
-
-    } 
-    else
-#undef unmap_and_return
-#endif /* CONFIG_PAGING_LEVELS >= 3 */
-    {
-        if ( page_out_of_sync(mfn_to_page(l2mfn)) &&
-             !snapshot_entry_matches(d, (guest_l1_pgentry_t *)v->arch.guest_vtable,
-                                     l2pfn, guest_l2_table_offset(va)) )
-            return 1;
-    }
-
-    __guest_get_l2e(v, va, &l2e);
-    if ( !(guest_l2e_get_flags(l2e) & _PAGE_PRESENT) ||
-         (guest_l2e_get_flags(l2e) & _PAGE_PSE))
-        return 0;
-
-    l1pfn = l2e_get_pfn(l2e);
-    l1mfn = gmfn_to_mfn(d, l1pfn);
-
-    // If the l1 pfn is invalid, it can't be out of sync...
-    if ( !VALID_MFN(l1mfn) )
-        return 0;
-
-    guest_pt = (guest_l1_pgentry_t *) map_domain_page(l1mfn);
-
-    if ( page_out_of_sync(mfn_to_page(l1mfn)) &&
-         !snapshot_entry_matches(
-             d, guest_pt, l1pfn, guest_l1_table_offset(va)) ) 
-    {
-        unmap_domain_page(guest_pt);
-        return 1;
-    }
-
-    unmap_domain_page(guest_pt);
-    return 0;
-}
-
-static int fix_entry(
-    struct domain *d,
-    l1_pgentry_t *pt, u32 *found, int is_l1_shadow, u32 max_refs_to_find)
-{
-    l1_pgentry_t old = *pt;
-    l1_pgentry_t new = old;
-
-    l1e_remove_flags(new,_PAGE_RW);
-    if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
-        BUG();
-    (*found)++;
-    *pt = new;
-    if ( is_l1_shadow )
-        shadow_put_page_from_l1e(old, d);
-
-    return (*found == max_refs_to_find);
-}
-
-static u32 remove_all_write_access_in_ptpage(
-    struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
-    unsigned long readonly_gpfn, unsigned long readonly_gmfn,
-    u32 max_refs_to_find, unsigned long prediction)
-{
-    l1_pgentry_t *pt = map_domain_page(pt_mfn);
-    l1_pgentry_t *pt_next = 0, *sl1e_p;
-    l1_pgentry_t match;
-    unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
-    int i;
-    u32 found = 0;
-    int is_l1_shadow =
-        ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
-         PGT_l1_shadow);
-#if CONFIG_PAGING_LEVELS >= 3
-    is_l1_shadow |=
-      ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
-                PGT_fl1_shadow);
-#endif
-
-    if ( SH_L1_HAS_NEXT_PAGE )
-        pt_next = map_domain_page(pt_mfn + 1);
-
-    match = l1e_from_pfn(readonly_gmfn, flags);
-
-    if ( shadow_mode_external(d) ) 
-    {
-        i = (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_va_mask)
-            >> PGT_va_shift;
-
-        if ( SH_L1_HAS_NEXT_PAGE &&
-             i >= L1_PAGETABLE_ENTRIES )
-            sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES];
-        else
-            sl1e_p = &pt[i];
-
-        if ( (i >= 0 && i < GUEST_L1_PAGETABLE_ENTRIES) &&
-             !l1e_has_changed(*sl1e_p, match, flags) &&
-             fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) &&
-             !prediction )
-            goto out;
-    }
-
-    for ( i = 0; i < GUEST_L1_PAGETABLE_ENTRIES; i++ )
-    {
-        if ( SH_L1_HAS_NEXT_PAGE &&
-             i >= L1_PAGETABLE_ENTRIES )
-            sl1e_p = &pt_next[i - L1_PAGETABLE_ENTRIES];
-        else
-            sl1e_p = &pt[i];
-
-        if ( unlikely(!l1e_has_changed(*sl1e_p, match, flags)) &&
-             fix_entry(d, sl1e_p, &found, is_l1_shadow, max_refs_to_find) )
-            break;
-    }
-
-out:
-    unmap_domain_page(pt);
-    if ( SH_L1_HAS_NEXT_PAGE )
-        unmap_domain_page(pt_next);
-
-    return found;
-}
-
-static int remove_all_write_access(
-    struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
-{
-    int i;
-    struct shadow_status *a;
-    u32 found = 0, write_refs;
-    unsigned long predicted_smfn;
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(VALID_MFN(readonly_gmfn));
-
-    perfc_incrc(remove_write_access);
-
-    // If it's not a writable page, then no writable refs can be outstanding.
-    //
-    if ( (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_type_mask) !=
-         PGT_writable_page )
-    {
-        perfc_incrc(remove_write_not_writable);
-        return 1;
-    }
-
-    // How many outstanding writable PTEs for this page are there?
-    //
-    write_refs =
-        (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_count_mask);
-    if ( write_refs && MFN_PINNED(readonly_gmfn) )
-    {
-        write_refs--;
-    }
-
-    if ( write_refs == 0 )
-    {
-        perfc_incrc(remove_write_no_work);
-        return 1;
-    }
-
-    if ( shadow_mode_external(d) ) {
-        if (--write_refs == 0)
-            return 0;
-
-         // Use the back pointer to locate the shadow page that can contain
-         // the PTE of interest
-         if ( (predicted_smfn = mfn_to_page(readonly_gmfn)->tlbflush_timestamp) ) {
-             found += remove_all_write_access_in_ptpage(
-                 d, predicted_smfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, 0);
-             if ( found == write_refs )
-                 return 0;
-         }
-    }
-
-    // Search all the shadow L1 page tables...
-    //
-    for (i = 0; i < shadow_ht_buckets; i++)
-    {
-        a = &d->arch.shadow_ht[i];
-        while ( a && a->gpfn_and_flags )
-        {
-            if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow
-#if CONFIG_PAGING_LEVELS >= 3
-              || (a->gpfn_and_flags & PGT_type_mask) == PGT_fl1_shadow
-#endif
-              )
-
-            {
-                found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
-                if ( found == write_refs )
-                    return 0;
-            }
-
-            a = a->next;
-        }
-    }
-
-    FSH_LOG("%s: looking for %d refs, found %d refs",
-            __func__, write_refs, found);
-
-    return 0;
-}
-
-static void resync_pae_guest_l3(struct domain *d)
-{
-    struct out_of_sync_entry *entry;
-    unsigned long i, idx;
-    unsigned long smfn, gmfn;
-    pgentry_64_t *guest, *shadow_l3, *snapshot;
-    struct vcpu *v = current;
-    int max = -1;
-    int unshadow = 0;
-
-    
-    ASSERT( shadow_mode_external(d) );
-
-    gmfn = pagetable_get_pfn(v->arch.guest_table);
-           
-    for ( entry = d->arch.out_of_sync; entry; entry = entry->next ) 
-    {
-        if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
-            continue;
-        if ( entry->gmfn != gmfn )
-            continue;
-
-        idx = get_cr3_idxval(v);
-
-        smfn = __shadow_status(d, entry->gpfn, PGT_l4_shadow);
-
-        if ( !smfn ) 
-            continue;
-
-        guest    = (pgentry_64_t *)map_domain_page(entry->gmfn);
-        snapshot = (pgentry_64_t *)map_domain_page(entry->snapshot_mfn);
-        shadow_l3 = (pgentry_64_t *)map_domain_page(smfn);
-
-        for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ ) 
-        {
-            int index = i + idx * PAE_L3_PAGETABLE_ENTRIES;
-            if ( entry_has_changed(
-                    guest[index], snapshot[index], PAGE_FLAG_MASK) ) 
-            {
-                unsigned long gpfn;
-
-                /*
-                 * Looks like it's no longer a page table. 
-                 */
-                if ( unlikely(entry_get_value(guest[index]) & PAE_PDPT_RESERVED) )
-                {
-                    if ( entry_get_flags(shadow_l3[i]) & _PAGE_PRESENT )
-                        put_shadow_ref(entry_get_pfn(shadow_l3[i]));
-
-                    shadow_l3[i] = entry_empty();
-                    continue;
-                }
-
-                gpfn = entry_get_pfn(guest[index]);
-
-                if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
-                {
-                    if ( entry_get_flags(shadow_l3[i]) & _PAGE_PRESENT )
-                        put_shadow_ref(entry_get_pfn(shadow_l3[i]));
-
-                    shadow_l3[i] = entry_empty();
-                    continue;
-                }
-
-                validate_entry_change(d, &guest[index],
-                                      &shadow_l3[i], PAGING_L3);
-            }
-
-            if ( entry_get_value(guest[index]) != 0 )
-                max = i;
-
-            if ( !(entry_get_flags(guest[index]) & _PAGE_PRESENT) &&
-                 unlikely(entry_get_value(guest[index]) != 0) &&
-                 !unshadow &&
-                 (frame_table[smfn].u.inuse.type_info & PGT_pinned) )
-                unshadow = 1;
-
-        }
-        if ( max == -1 )
-            unshadow = 1;
-
-        unmap_domain_page(guest);
-        unmap_domain_page(snapshot);
-        unmap_domain_page(shadow_l3);
-
-        if ( unlikely(unshadow) )
-            shadow_unpin(smfn);
-        break;
-    }
-}
-
-static int resync_all(struct domain *d, u32 stype)
-{
-    struct out_of_sync_entry *entry;
-    unsigned i;
-    unsigned long smfn;
-    void *guest, *shadow, *snapshot;
-    int need_flush = 0, external = shadow_mode_external(d);
-    int unshadow;
-    int changed;
-    u32 min_max_shadow, min_max_snapshot;
-    int min_shadow, max_shadow, min_snapshot, max_snapshot;
-    struct vcpu *v;
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
-    {
-        int max = -1;
-
-        if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
-            continue;
-
-        smfn = __shadow_status(d, entry->gpfn, stype);
-
-        if ( !smfn )
-        {
-            // For heavy weight shadows: no need to update refcounts if
-            // there's no shadow page.
-            //
-            if ( shadow_mode_refcounts(d) )
-                continue;
-
-            // For light weight shadows: only need up resync the refcounts to
-            // the new contents of the guest page iff this it has the right
-            // page type.
-            //
-            if ( stype != ( mfn_to_page(entry->gmfn)->u.inuse.type_info & PGT_type_mask) )
-                continue;
-        }
-
-        FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
-                stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
-
-        // Compare guest's new contents to its snapshot, validating
-        // and updating its shadow as appropriate.
-        //
-        guest    = map_domain_page(entry->gmfn);
-        snapshot = map_domain_page(entry->snapshot_mfn);
-
-        if ( smfn )
-            shadow = map_domain_page(smfn);
-        else
-            shadow = NULL;
-
-        unshadow = 0;
-
-        min_max_shadow = mfn_to_page(smfn)->tlbflush_timestamp;
-        min_shadow     = SHADOW_MIN(min_max_shadow);
-        max_shadow     = SHADOW_MAX(min_max_shadow);
-
-        min_max_snapshot= mfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
-        min_snapshot    = SHADOW_MIN(min_max_snapshot);
-        max_snapshot    = SHADOW_MAX(min_max_snapshot);
-
-        switch ( stype )
-        {
-        case PGT_l1_shadow:
-        {
-            guest_l1_pgentry_t *guest1 = guest;
-            l1_pgentry_t *shadow1 = shadow;
-            l1_pgentry_t *shadow1_next = 0, *sl1e_p;
-            guest_l1_pgentry_t *snapshot1 = snapshot;
-            int unshadow_l1 = 0;
-
-            ASSERT(shadow_mode_write_l1(d) ||
-                   shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
-
-            if ( !shadow_mode_refcounts(d) )
-                revalidate_l1(d, (l1_pgentry_t *)guest1, (l1_pgentry_t *)snapshot1);
-            if ( !smfn )
-                break;
-
-            changed = 0;
-
-            if ( SH_L1_HAS_NEXT_PAGE && shadow1 )
-                shadow1_next = map_domain_page(smfn + 1);
-
-            for ( i = min_shadow; i <= max_shadow; i++ )
-            {
-
-                if ( SH_L1_HAS_NEXT_PAGE && i >= L1_PAGETABLE_ENTRIES )
-                    sl1e_p = &shadow1_next[i - L1_PAGETABLE_ENTRIES];
-                else
-                    sl1e_p = &shadow1[i];
-
-                if ( (i < min_snapshot) || (i > max_snapshot) ||
-                     guest_l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) )
-                {
-                    int error;
-
-#if CONFIG_PAGING_LEVELS >= 3
-                    unsigned long gpfn;
-
-                    gpfn = guest_l1e_get_paddr(guest1[i]) >> PAGE_SHIFT;
-
-                    if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
-                    {
-                        guest_l1_pgentry_t tmp_gl1e = guest_l1e_empty();
-                        validate_pte_change(d, tmp_gl1e, sl1e_p);
-                        unshadow_l1 = 1;
-                        continue;
-                    }
-#endif
-
-                    error = validate_pte_change(d, guest1[i], sl1e_p);
-                    if ( error ==  -1 )
-                        unshadow_l1 = 1;
-                    else {
-                        need_flush |= error;
-                        if ( l1e_get_flags(*sl1e_p) & _PAGE_PRESENT )
-                            set_guest_back_ptr(d, *sl1e_p, smfn, i);
-                    }
-                    // can't update snapshots of linear page tables -- they
-                    // are used multiple times...
-                    //
-                    // snapshot[i] = new_pte;
-
-                    changed++;
-                }
-            }
-
-            if ( shadow1_next )
-                unmap_domain_page(shadow1_next);
-
-            perfc_incrc(resync_l1);
-            perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
-            perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
-
-            if ( d->arch.ops->guest_paging_levels >= PAGING_L3 &&
-                 unshadow_l1 ) {
-                pgentry_64_t l2e = { 0 };
-
-                __shadow_get_l2e(entry->v, entry->va, &l2e);
-
-                if ( entry_get_flags(l2e) & _PAGE_PRESENT ) {
-                    put_shadow_ref(entry_get_pfn(l2e));
-                    l2e = entry_empty();
-                    __shadow_set_l2e(entry->v, entry->va, &l2e);
-
-                    if (entry->v == current)
-                        need_flush = 1;
-                }
-            }
-
-            break;
-        }
-#if CONFIG_PAGING_LEVELS == 2
-        case PGT_l2_shadow:
-        {
-            l2_pgentry_t *guest2 = guest;
-            l2_pgentry_t *shadow2 = shadow;
-            l2_pgentry_t *snapshot2 = snapshot;
-
-            ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
-            BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
-
-            changed = 0;
-            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-            {
-                if ( !is_guest_l2_slot(0,i) && !external )
-                    continue;
-
-                l2_pgentry_t new_pde = guest2[i];
-                if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK))
-                {
-                    need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
-
-                    // can't update snapshots of linear page tables -- they
-                    // are used multiple times...
-                    //
-                    // snapshot[i] = new_pde;
-
-                    changed++;
-                }
-                if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */
-                    max = i;
-
-                // XXX - This hack works for linux guests.
-                //       Need a better solution long term.
-                if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
-                     unlikely(l2e_get_intpte(new_pde) != 0) &&
-                     !unshadow && MFN_PINNED(smfn) )
-                    unshadow = 1;
-            }
-            if ( max == -1 )
-                unshadow = 1;
-            perfc_incrc(resync_l2);
-            perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
-            break;
-        }
-        case PGT_hl2_shadow:
-        {
-            l2_pgentry_t *guest2 = guest;
-            l2_pgentry_t *snapshot2 = snapshot;
-            l1_pgentry_t *shadow2 = shadow;
-
-            ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
-            BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
-
-            changed = 0;
-            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-            {
-                if ( !is_guest_l2_slot(0, i) && !external )
-                    continue;
-
-                l2_pgentry_t new_pde = guest2[i];
-                if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) )
-                {
-                    need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
-
-                    // can't update snapshots of linear page tables -- they
-                    // are used multiple times...
-                    //
-                    // snapshot[i] = new_pde;
-
-                    changed++;
-                }
-            }
-            perfc_incrc(resync_hl2);
-            perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
-            break;
-        }
-#elif CONFIG_PAGING_LEVELS >= 3
-        case PGT_l2_shadow:
-        case PGT_l3_shadow:
-        {
-            pgentry_64_t *guest_pt = guest;
-            pgentry_64_t *shadow_pt = shadow;
-            pgentry_64_t *snapshot_pt = snapshot;
-
-            changed = 0;
-            for ( i = min_shadow; i <= max_shadow; i++ )
-            {
-                if ( (i < min_snapshot) || (i > max_snapshot) ||
-                    entry_has_changed(
-                        guest_pt[i], snapshot_pt[i], PAGE_FLAG_MASK) )
-                {
-                    unsigned long gpfn;
-
-                    gpfn = entry_get_pfn(guest_pt[i]);
-                    /*
-                     * Looks like it's no longer a page table.
-                     */
-                    if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
-                    {
-                        if ( entry_get_flags(shadow_pt[i]) & _PAGE_PRESENT )
-                            put_shadow_ref(entry_get_pfn(shadow_pt[i]));
-                         shadow_pt[i] = entry_empty();
-                        continue;
-                    }
-
-                    need_flush |= validate_entry_change(
-                        d, &guest_pt[i], &shadow_pt[i],
-                        shadow_type_to_level(stype));
-                    changed++;
-                }
-#if CONFIG_PAGING_LEVELS == 3
-                if ( stype == PGT_l3_shadow ) 
-                {
-                    if ( entry_get_value(guest_pt[i]) != 0 ) 
-                        max = i;
-
-                    if ( !(entry_get_flags(guest_pt[i]) & _PAGE_PRESENT) &&
-                         unlikely(entry_get_value(guest_pt[i]) != 0) &&
-                         !unshadow &&
-                         (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) )
-                        unshadow = 1;
-                }
-#endif
-            }
-
-            if ( d->arch.ops->guest_paging_levels == PAGING_L3
-                 && max == -1 && stype == PGT_l3_shadow )
-                unshadow = 1;
-
-            perfc_incrc(resync_l3);
-            perfc_incr_histo(shm_l3_updates, changed, PT_UPDATES);
-            break;
-        }
-        case PGT_l4_shadow:
-        {
-            guest_root_pgentry_t *guest_root = guest;
-            guest_root_pgentry_t *snapshot_root = snapshot;
-
-            changed = 0;
-            for ( i = 0; i < GUEST_ROOT_PAGETABLE_ENTRIES; i++ )
-            {
-                guest_root_pgentry_t new_root_e = guest_root[i];
-                if ( !is_guest_l4_slot(i) && !external )
-                    continue;
-                if ( root_entry_has_changed(
-                        new_root_e, snapshot_root[i], PAGE_FLAG_MASK))
-                {
-#ifndef GUEST_PGENTRY_32
-                    l4_pgentry_t *shadow4 = shadow;
-                    unsigned long gpfn;
-
-                    gpfn = l4e_get_pfn(new_root_e);
-
-                    /*
-                     * Looks like it's no longer a page table.
-                     */
-                    if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
-                    {
-                        if ( l4e_get_flags(shadow4[i]) & _PAGE_PRESENT )
-                            put_shadow_ref(l4e_get_pfn(shadow4[i]));
-                        shadow4[i] = l4e_empty();
-                        continue;
-                    }
-
-                    if ( d->arch.ops->guest_paging_levels == PAGING_L4 ) 
-                    {
-                        need_flush |= validate_entry_change(
-                          d, (pgentry_64_t *)&new_root_e,
-                          (pgentry_64_t *)&shadow4[i], shadow_type_to_level(stype));
-                    }
-                    else
-#endif
-                    {
-                        validate_bl2e_change(d, &new_root_e, shadow, i);
-                    }
-                    changed++;
-                    ESH_LOG("%d: shadow4 mfn: %lx, shadow root: %lx\n", i,
-                      smfn, pagetable_get_paddr(current->arch.shadow_table));
-                }
-                if ( guest_root_get_intpte(new_root_e) != 0 ) /* FIXME: check flags? */
-                    max = i;
-
-                //  Need a better solution in the long term.
-                if ( !(guest_root_get_flags(new_root_e) & _PAGE_PRESENT) &&
-                     unlikely(guest_root_get_intpte(new_root_e) != 0) &&
-                     !unshadow &&
-                     (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) )
-                    unshadow = 1;
-            }
-            if ( max == -1 )
-                unshadow = 1;
-            perfc_incrc(resync_l4);
-            perfc_incr_histo(shm_l4_updates, changed, PT_UPDATES);
-            break;
-        }
-
-#endif /* CONFIG_PAGING_LEVELS >= 3 */
-        default:
-            BUG();
-        }
-
-        if ( smfn )
-            unmap_domain_page(shadow);
-        unmap_domain_page(snapshot);
-        unmap_domain_page(guest);
-
-        if ( unlikely(unshadow && stype == PGT_root_page_table) )
-        {
-            for_each_vcpu(d, v)
-                if(smfn == pagetable_get_pfn(v->arch.shadow_table))
-                    return need_flush;
-            perfc_incrc(unshadow_l2_count);
-            shadow_unpin(smfn);
-#if CONFIG_PAGING_LEVELS == 2
-            if ( unlikely(shadow_mode_external(d)) )
-            {
-                unsigned long hl2mfn;
-
-                if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
-                     MFN_PINNED(hl2mfn) )
-                    shadow_unpin(hl2mfn);
-            }
-#endif
-        }
-    }
-
-    return need_flush;
-}
-
-#if CONFIG_PAGING_LEVELS == 2
-static int resync_all_levels_guest_page(struct domain *d)
-{
-    int need_flush = 0;
-
-    need_flush |= resync_all(d, PGT_l1_shadow);
-    if ( d->arch.ops->guest_paging_levels == PAGING_L2 &&
-         shadow_mode_translate(d) )
-    {
-        need_flush |= resync_all(d, PGT_hl2_shadow);
-    }
-    return need_flush;
-}
-#elif CONFIG_PAGING_LEVELS == 3
-static int resync_all_levels_guest_page(struct domain *d)
-{
-    int need_flush = 0;
-
-    need_flush |= resync_all(d, PGT_l1_shadow);
-    if ( d->arch.ops->guest_paging_levels == PAGING_L2 ) 
-        need_flush |= resync_all(d, PGT_l4_shadow);
-    else
-    {
-        need_flush |= resync_all(d, PGT_l2_shadow);
-        if ( shadow_mode_log_dirty(d) )
-        {
-            need_flush |= resync_all(d, PGT_l3_shadow);
-            need_flush |= resync_all(d, PGT_l4_shadow);
-        }
-        else
-            resync_pae_guest_l3(d);
-    }
-    
-    return need_flush;
-}
-#elif CONFIG_PAGING_LEVELS == 4
-static int resync_all_levels_guest_page(struct domain *d)
-{
-    int need_flush = 0;
-
-    need_flush |= resync_all(d, PGT_l1_shadow);
-    if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
-        need_flush |= resync_all(d, PGT_l4_shadow);
-    else
-    {
-        need_flush |= resync_all(d, PGT_l2_shadow);
-        if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
-            resync_pae_guest_l3(d);
-        else
-        {
-            need_flush |= resync_all(d, PGT_l3_shadow);
-            need_flush |= resync_all(d, PGT_l4_shadow);
-        }
-    }
-    return need_flush;
-}
-#endif
-
-static void sync_all(struct domain *d)
-{
-    struct out_of_sync_entry *entry;
-    int need_flush = 0;
-    l1_pgentry_t *ppte, opte, npte;
-    cpumask_t other_vcpus_mask;
-
-    perfc_incrc(shadow_sync_all);
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    // First, remove all write permissions to the page tables
-    //
-    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
-    {
-        // Skip entries that have low bits set...  Those aren't
-        // real PTEs.
-        //
-        if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
-            continue;
-
-        ppte = (l1_pgentry_t *)(
-            (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) +
-            (entry->writable_pl1e & ~PAGE_MASK));
-        opte = npte = *ppte;
-        l1e_remove_flags(npte, _PAGE_RW);
-
-        if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
-             !shadow_get_page_from_l1e(npte, d) )
-            BUG();
-        *ppte = npte;
-        set_guest_back_ptr(d, npte, (entry->writable_pl1e) >> PAGE_SHIFT,
-                           (entry->writable_pl1e & ~PAGE_MASK)/sizeof(l1_pgentry_t));
-        shadow_put_page_from_l1e(opte, d);
-
-        unmap_domain_page(ppte);
-    }
-
-    /* Other VCPUs mustn't use the revoked writable mappings. */
-    other_vcpus_mask = d->domain_dirty_cpumask;
-    cpu_clear(smp_processor_id(), other_vcpus_mask);
-    flush_tlb_mask(other_vcpus_mask);
-
-    /* Flush ourself later. */
-    need_flush = 1;
-
-    need_flush |= resync_all_levels_guest_page(d);
-
-    if ( need_flush && !unlikely(shadow_mode_external(d)) )
-        local_flush_tlb();
-
-    free_out_of_sync_state(d);
-}
-
-static inline int l1pte_write_fault(
-    struct vcpu *v, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p,
-    unsigned long va)
-{
-    struct domain *d = v->domain;
-    guest_l1_pgentry_t gpte = *gpte_p;
-    l1_pgentry_t spte;
-    unsigned long gpfn = l1e_get_pfn(gpte);
-    unsigned long gmfn = gmfn_to_mfn(d, gpfn);
-
-    //printk("l1pte_write_fault gmfn=%lx\n", gmfn);
-
-    if ( unlikely(!VALID_MFN(gmfn)) )
-    {
-        SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn);
-        *spte_p = l1e_empty();
-        return 0;
-    }
-
-    ASSERT(guest_l1e_get_flags(gpte) & _PAGE_RW);
-    guest_l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED);
-    spte = l1e_from_pfn(gmfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
-    SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
-             l1e_get_intpte(spte), l1e_get_intpte(gpte));
-
-    __mark_dirty(d, gmfn);
-
-    if ( mfn_is_page_table(gmfn) )
-        shadow_mark_va_out_of_sync(v, gpfn, gmfn, va);
-
-    *gpte_p = gpte;
-    *spte_p = spte;
-
-    return 1;
-}
-
-static inline int l1pte_read_fault(
-    struct domain *d, guest_l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p)
-{
-    guest_l1_pgentry_t gpte = *gpte_p;
-    l1_pgentry_t spte = *spte_p;
-    unsigned long pfn = l1e_get_pfn(gpte);
-    unsigned long mfn = gmfn_to_mfn(d, pfn);
-
-    if ( unlikely(!VALID_MFN(mfn)) )
-    {
-        SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn);
-        *spte_p = l1e_empty();
-        return 0;
-    }
-
-    guest_l1e_add_flags(gpte, _PAGE_ACCESSED);
-    spte = l1e_from_pfn(mfn, guest_l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
-    if ( shadow_mode_log_dirty(d) || !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) ||
-         mfn_is_page_table(mfn) )
-    {
-        l1e_remove_flags(spte, _PAGE_RW);
-    }
-
-    SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
-             l1e_get_intpte(spte), l1e_get_intpte(gpte));
-    *gpte_p = gpte;
-    *spte_p = spte;
-
-    return 1;
-}
-#if CONFIG_PAGING_LEVELS == 2
-static int shadow_fault_32(unsigned long va, struct cpu_user_regs *regs)
-{
-    l1_pgentry_t gpte, spte, orig_gpte;
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    l2_pgentry_t gpde;
-
-    spte = l1e_empty();
-
-    SH_VVLOG("shadow_fault( va=%lx, code=%lu )",
-             va, (unsigned long)regs->error_code);
-    perfc_incrc(shadow_fault_calls);
-
-    check_pagetable(v, "pre-sf");
-
-    /*
-     * Don't let someone else take the guest's table pages out-of-sync.
-     */
-    shadow_lock(d);
-
-    /* XXX - FIX THIS COMMENT!!!
-     * STEP 1. Check to see if this fault might have been caused by an
-     *         out-of-sync table page entry, or if we should pass this
-     *         fault onto the guest.
-     */
-    __shadow_sync_va(v, va);
-
-    /*
-     * STEP 2. Check the guest PTE.
-     */
-    __guest_get_l2e(v, va, &gpde);
-    if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
-    {
-        SH_VVLOG("shadow_fault - EXIT: L1 not present");
-        perfc_incrc(shadow_fault_bail_pde_not_present);
-        goto fail;
-    }
-
-    // This can't fault because we hold the shadow lock and we've ensured that
-    // the mapping is in-sync, so the check of the PDE's present bit, above,
-    // covers this access.
-    //
-    //orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)];
-    __guest_get_l1e(v, va, &gpte);
-    orig_gpte = gpte;
-
-    if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
-    {
-        SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ")",
-                 l1e_get_intpte(gpte));
-        perfc_incrc(shadow_fault_bail_pte_not_present);
-        goto fail;
-    }
-
-    /* Write fault? */
-    if ( regs->error_code & 2 )
-    {
-        int allow_writes = 0;
-
-        if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
-        {
-            if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gpte)) )
-            {
-                allow_writes = 1;
-                l1e_add_flags(gpte, _PAGE_RW);
-            }
-            else
-            {
-                /* Write fault on a read-only mapping. */
-                SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
-                         l1e_get_intpte(gpte));
-                perfc_incrc(shadow_fault_bail_ro_mapping);
-                goto fail;
-            }
-        }
-        else if ( unlikely(!shadow_mode_wr_pt_pte(d) && mfn_is_page_table(l1e_get_pfn(gpte))) )
-        {
-            SH_LOG("l1pte_write_fault: no write access to page table page");
-            domain_crash_synchronous();
-        }
-
-        if ( unlikely(!l1pte_write_fault(v, &gpte, &spte, va)) )
-        {
-            SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
-            perfc_incrc(write_fault_bail);
-            shadow_unlock(d);
-            return 0;
-        }
-
-        if ( allow_writes )
-            l1e_remove_flags(gpte, _PAGE_RW);
-    }
-    else
-    {
-        if ( !l1pte_read_fault(d, &gpte, &spte) )
-        {
-            SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
-            perfc_incrc(read_fault_bail);
-            shadow_unlock(d);
-            return 0;
-        }
-    }
-
-    /*
-     * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
-     */
-    if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) )
-    {
-        /* XXX Watch out for read-only L2 entries! (not used in Linux). */
-        /*if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
-                                     &gpte, sizeof(gpte))) )*/
-        if ( unlikely(!__guest_set_l1e(v, va, &gpte)))
-        {
-            printk("%s() failed, crashing domain %d "
-                   "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n",
-                   __func__,d->domain_id, l2e_get_intpte(gpde), va);
-            domain_crash_synchronous();
-        }
-
-        __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gpde)));
-    }
-
-    shadow_set_l1e(va, spte, 1);
-
-    perfc_incrc(shadow_fault_fixed);
-    d->arch.shadow_fault_count++;
-
-    shadow_unlock(d);
-
-    check_pagetable(v, "post-sf");
-    return EXCRET_fault_fixed;
-
-fail:
-    shadow_unlock(d);
-    return 0;
-}
-#endif /* CONFIG_PAGING_LEVELS == 2 */
-
-static inline unsigned long va_to_l1mfn(struct vcpu *v, unsigned long va)
-{
-    struct domain *d = v->domain;
-    guest_l2_pgentry_t gl2e = {0};
-
-    __guest_get_l2e(v, va, &gl2e);
-    
-    if ( unlikely(!(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT)) )
-        return INVALID_MFN;
-
-    return gmfn_to_mfn(d, l2e_get_pfn(gl2e));
-}
-
-static int do_update_va_mapping(unsigned long va,
-                                l1_pgentry_t val,
-                                struct vcpu *v)
-{
-    struct domain *d = v->domain;
-    l1_pgentry_t spte;
-    int rc = 0;
-
-    shadow_lock(d);
-
-    // This is actually overkill - we don't need to sync the L1 itself,
-    // just everything involved in getting to this L1 (i.e. we need
-    // linear_pg_table[l1_linear_offset(va)] to be in sync)...
-    //
-    __shadow_sync_va(v, va);
-
-    l1pte_propagate_from_guest(d, *(guest_l1_pgentry_t *)&val, &spte);
-#if CONFIG_PAGING_LEVELS == 2
-    shadow_set_l1e(va, spte, 0);
-#elif CONFIG_PAGING_LEVELS >= 3
-    shadow_set_l1e_64(va, (pgentry_64_t *) &spte, 0);
-#endif
-    /*
-     * If we're in log-dirty mode then we need to note that we've updated
-     * the PTE in the PT-holding page. We need the machine frame number
-     * for this.
-     */
-    __mark_dirty(d, va_to_l1mfn(v, va));
-
-    shadow_unlock(d);
-
-    return rc;
-}
-
-
-/*
- * What lives where in the 32-bit address space in the various shadow modes,
- * and what it uses to get/maintain that mapping.
- *
- * SHADOW MODE:      none         enable         translate         external
- *
- * 4KB things:
- * guest_vtable    lin_l2     mapped per gl2   lin_l2 via hl2   mapped per gl2
- * shadow_vtable     n/a         sh_lin_l2       sh_lin_l2      mapped per gl2
- * hl2_vtable        n/a            n/a        lin_hl2 via hl2  mapped per gl2
- * monitor_vtable    n/a            n/a             n/a           mapped once
- *
- * 4MB things:
- * guest_linear  lin via gl2    lin via gl2      lin via hl2      lin via hl2
- * shadow_linear     n/a      sh_lin via sl2   sh_lin via sl2   sh_lin via sl2
- * monitor_linear    n/a            n/a             n/a              ???
- * perdomain      perdomain      perdomain       perdomain        perdomain
- * R/O M2P         R/O M2P        R/O M2P           n/a              n/a
- * R/W M2P         R/W M2P        R/W M2P         R/W M2P          R/W M2P
- * P2M               n/a            n/a           R/O M2P          R/O M2P
- *
- * NB:
- * update_pagetables(), shadow_update_pagetables(), shadow_mode_enable(),
- * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
- * all play a part in maintaining these mappings.
- */
-static void shadow_update_pagetables(struct vcpu *v)
-{
-    struct domain *d = v->domain;
-#if CONFIG_PAGING_LEVELS == 4
-    unsigned long gmfn = ((v->arch.flags & TF_kernel_mode)?
-                          pagetable_get_pfn(v->arch.guest_table) :
-                          pagetable_get_pfn(v->arch.guest_table_user));
-#else
-    unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table);
-#endif
-
-    unsigned long gpfn = mfn_to_gmfn(d, gmfn);
-    unsigned long smfn, old_smfn;
-
-#if CONFIG_PAGING_LEVELS == 2
-    unsigned long hl2mfn;
-#endif
-    int need_sync = 0;
-
-    int max_mode = ( shadow_mode_external(d) ? SHM_external
-                     : shadow_mode_translate(d) ? SHM_translate
-                     : shadow_mode_enabled(d) ? SHM_enable
-                     : 0 );
-
-    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
-    ASSERT( max_mode );
-
-    /*
-     *  arch.guest_vtable
-     */
-    if ( max_mode & (SHM_enable | SHM_external) )
-    {
-        if ( likely(v->arch.guest_vtable != NULL) )
-            unmap_domain_page_global(v->arch.guest_vtable);
-        v->arch.guest_vtable = map_domain_page_global(gmfn);
-    }
-
-    /*
-     *  arch.shadow_table
-     */
-#if CONFIG_PAGING_LEVELS == 3 & defined (GUEST_PGENTRY_32)
-    /*
-     * We use PGT_l4_shadow for 2-level paging guests on PAE
-     */
-    if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
-    { 
-        if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_l4_shadow))) )
-            smfn = shadow_l3_table(v, gpfn, gmfn);
-    } 
-    else
-#endif
-
-#if CONFIG_PAGING_LEVELS == 3 & defined ( GUEST_32PAE )
-    /*
-     * We use PGT_l4_shadow for 2-level paging guests on PAE
-     */
-    if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
-    {
-        if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_l4_shadow))) )
-            smfn = shadow_l3_table(v, gpfn, gmfn);
-        else
-        {
-            update_top_level_shadow(v, smfn);
-            need_sync = 1;
-        }
-    }
-    else
-#endif
-    if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) ) 
-    {
-#if CONFIG_PAGING_LEVELS == 2
-        smfn = shadow_l2_table(v, gpfn, gmfn);
-#elif CONFIG_PAGING_LEVELS == 3
-        smfn = shadow_l3_table(v, gpfn, gmfn);
-#elif CONFIG_PAGING_LEVELS == 4
-        smfn = shadow_l4_table(v, gpfn, gmfn);
-#endif
-    }
-    else
-    {
-#if CONFIG_PAGING_LEVELS >= 3
-        if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
-            update_top_level_shadow(v, smfn);
-#endif
-        /*
-         *  move sync later in order to avoid this smfn been 
-         *  unshadowed occasionally
-         */
-        need_sync = 1;
-    }
-
-
-    if ( !get_shadow_ref(smfn) )
-        BUG();
-    old_smfn = pagetable_get_pfn(v->arch.shadow_table);
-    v->arch.shadow_table = pagetable_from_pfn(smfn);
-    if ( old_smfn )
-        put_shadow_ref(old_smfn);
-
-    SH_VVLOG("shadow_update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn);
-
-    /*
-     * arch.shadow_vtable
-     */
-    if ( max_mode == SHM_external
-#if CONFIG_PAGING_LEVELS >=3
-         || max_mode & SHM_enable
-#endif
-        )
-    {
-        if ( v->arch.shadow_vtable )
-            unmap_domain_page_global(v->arch.shadow_vtable);
-        v->arch.shadow_vtable = map_domain_page_global(smfn);
-    }
-
-#if CONFIG_PAGING_LEVELS == 2
-    /*
-     * arch.hl2_vtable
-     */
-
-    // if max_mode == SHM_translate, then the hl2 is already installed
-    // correctly in its smfn, and there's nothing to do.
-    //
-    if ( max_mode == SHM_external )
-    {
-        if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
-            hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
-        if ( v->arch.hl2_vtable )
-            unmap_domain_page_global(v->arch.hl2_vtable);
-        v->arch.hl2_vtable = map_domain_page_global(hl2mfn);
-    }
-
-    /*
-     * fixup pointers in monitor table, as necessary
-     */
-    if ( max_mode == SHM_external )
-    {
-        l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
-        l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
-        l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
-
-        ASSERT( shadow_mode_translate(d) );
-
-        if ( !get_shadow_ref(hl2mfn) )
-            BUG();
-        mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
-            l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
-        if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
-            put_shadow_ref(l2e_get_pfn(old_hl2e));
-
-        if ( !get_shadow_ref(smfn) )
-            BUG();
-        mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
-            l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
-        if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
-            put_shadow_ref(l2e_get_pfn(old_sl2e));
-
-        // XXX - maybe this can be optimized somewhat??
-        local_flush_tlb();
-    }
-#endif /* CONFIG_PAGING_LEVELS == 2 */
-
-#if CONFIG_PAGING_LEVELS == 3
-    /*
-     * fixup pointers in monitor table, as necessary
-     */
-    if ( max_mode == SHM_external )
-    {
-        l3_pgentry_t *mpl3e = (l3_pgentry_t *) v->arch.monitor_vtable;
-        l2_pgentry_t *spl2e;
-        unsigned long s2mfn;
-        int i;
-        ASSERT( shadow_mode_translate(d) );
-        s2mfn = l3e_get_pfn(mpl3e[L3_PAGETABLE_ENTRIES - 1]);
-        ASSERT( s2mfn);
-        spl2e = map_domain_page(s2mfn);
-        for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
-            spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] =
-                (l3e_get_flags(mpl3e[i]) & _PAGE_PRESENT) ?
-                l2e_from_pfn(l3e_get_pfn(mpl3e[i]), __PAGE_HYPERVISOR) :
-                l2e_empty();
-        unmap_domain_page(spl2e);
-        local_flush_tlb();
-    }
-#endif
-
-    if(likely(need_sync))
-        shadow_sync_all(d);
-}
-
-
-/************************************************************************/
-/************************************************************************/
-/************************************************************************/
-
-#if 0 // this code has not been updated for 32pae & 64 bit modes
-#if SHADOW_DEBUG
-
-// The following is entirely for _check_pagetable()'s benefit.
-// _check_pagetable() wants to know whether a given entry in a
-// shadow page table is supposed to be the shadow of the guest's
-// current entry, or the shadow of the entry held in the snapshot
-// taken above.
-//
-// Here, we mark all currently existing entries as reflecting
-// the snapshot, above.  All other places in xen that update
-// the shadow will keep the shadow in sync with the guest's
-// entries (via l1pte_propagate_from_guest and friends), which clear
-// the SHADOW_REFLECTS_SNAPSHOT bit.
-//
-static void
-mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
-{
-    unsigned long smfn;
-    l1_pgentry_t *l1e;
-    l2_pgentry_t *l2e;
-    unsigned i;
-
-    if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
-    {
-        l1e = map_domain_page(smfn);
-        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-            if ( is_guest_l1_slot(i) &&
-                 (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
-                l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT);
-        unmap_domain_page(l1e);
-    }
-
-    if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
-    {
-        l2e = map_domain_page(smfn);
-        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-            if ( is_guest_l2_slot(0, i) &&
-                 (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
-                l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT);
-        unmap_domain_page(l2e);
-    }
-}
-
-// BUG: these are not SMP safe...
-static int sh_l2_present;
-static int sh_l1_present;
-static char *sh_check_name;
-// int shadow_status_noswap; // declared in shadow32.c
-
-#define v2m(_v, _adr) ({                                                     \
-    unsigned long _a  = (unsigned long)(_adr);                               \
-    l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)];     \
-    unsigned long _pa = -1;                                                  \
-    if ( l2e_get_flags(_pde) & _PAGE_PRESENT )                               \
-    {                                                                        \
-        l1_pgentry_t _pte;                                                   \
-        _pte = shadow_linear_pg_table[l1_linear_offset(_a)];                 \
-        if ( l1e_get_flags(_pte) & _PAGE_PRESENT )                           \
-            _pa = l1e_get_paddr(_pte);                                       \
-    }                                                                        \
-    _pa | (_a & ~PAGE_MASK);                                                 \
-})
-
-#define FAIL(_f, _a...)                                                      \
-    do {                                                                     \
-        printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n",                  \
-               sh_check_name, level, l2_idx, l1_idx, ## _a,                  \
-               __FILE__, __LINE__);                                          \
-        printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte                \
-               " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte               \
-               " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p"               \
-               " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n",                   \
-               l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte),     \
-               l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte),     \
-               p_guest_pte, p_shadow_pte, p_snapshot_pte,                    \
-               (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte),    \
-               (void *)v2m(v, p_snapshot_pte),                               \
-               (l2_idx << L2_PAGETABLE_SHIFT) |                              \
-               (l1_idx << L1_PAGETABLE_SHIFT));                              \
-        errors++;                                                            \
-    } while ( 0 )
-
-static int check_pte(
-    struct vcpu *v,
-    l1_pgentry_t *p_guest_pte,
-    l1_pgentry_t *p_shadow_pte,
-    l1_pgentry_t *p_snapshot_pte,
-    int level, int l2_idx, int l1_idx)
-{
-    struct domain *d = v->domain;
-    l1_pgentry_t guest_pte = *p_guest_pte;
-    l1_pgentry_t shadow_pte = *p_shadow_pte;
-    l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
-    l1_pgentry_t eff_guest_pte;
-    unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
-    int errors = 0, guest_writable;
-    int page_table_page;
-
-    if ( (l1e_get_intpte(shadow_pte) == 0) ||
-         (l1e_get_intpte(shadow_pte) == 0xdeadface) ||
-         (l1e_get_intpte(shadow_pte) == 0x00000E00) )
-        return errors;  /* always safe */
-
-    if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
-        FAIL("Non zero not present shadow_pte");
-
-    if ( level == 2 ) sh_l2_present++;
-    if ( level == 1 ) sh_l1_present++;
-
-    if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte )
-        eff_guest_pte = snapshot_pte;
-    else
-        eff_guest_pte = guest_pte;
-
-    if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
-        FAIL("Guest not present yet shadow is");
-
-    mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
-
-    if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) )
-        FAIL("Corrupt?");
-
-    if ( (level == 1) &&
-         (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
-         !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
-        FAIL("Dirty coherence");
-
-    if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
-         !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
-        FAIL("Accessed coherence");
-
-    if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
-        FAIL("global bit set in shadow");
-
-    eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
-    eff_guest_mfn = gmfn_to_mfn(d, eff_guest_pfn);
-    shadow_mfn = l1e_get_pfn(shadow_pte);
-
-    if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
-        FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n",
-             __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte));
-
-    page_table_page = mfn_is_page_table(eff_guest_mfn);
-
-    guest_writable =
-        (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
-        (shadow_mode_write_l1(d) && (level == 1) && mfn_out_of_sync(eff_guest_mfn));
-
-    if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
-    {
-        printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n",
-               eff_guest_pfn, eff_guest_mfn, shadow_mfn,
-               mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
-               page_table_page);
-        FAIL("RW coherence");
-    }
-
-    if ( (level == 1) &&
-         (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
-         !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
-    {
-        printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08lx page_table_page=%d\n",
-               eff_guest_pfn, eff_guest_mfn, shadow_mfn,
-               mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
-               page_table_page);
-        FAIL("RW2 coherence");
-    }
-
-    if ( eff_guest_mfn == shadow_mfn )
-    {
-        if ( level > 1 )
-            FAIL("Linear map ???");    /* XXX this will fail on BSD */
-    }
-    else
-    {
-        if ( level < 2 )
-            FAIL("Shadow in L1 entry?");
-
-        if ( level == 2 )
-        {
-            if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn )
-                FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn,
-                     __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
-        }
-        else
-            BUG(); // XXX -- not handled yet.
-    }
-
-    return errors;
-}
-#undef FAIL
-#undef v2m
-
-static int check_l1_table(
-    struct vcpu *v, unsigned long gpfn,
-    unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
-{
-    struct domain *d = v->domain;
-    int i;
-    unsigned long snapshot_mfn;
-    l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
-    int errors = 0;
-
-    if ( page_out_of_sync(mfn_to_page(gmfn)) )
-    {
-        snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
-        ASSERT(snapshot_mfn);
-        p_snapshot = map_domain_page(snapshot_mfn);
-    }
-
-    p_guest  = map_domain_page(gmfn);
-    p_shadow = map_domain_page(smfn);
-
-    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-        errors += check_pte(v, p_guest+i, p_shadow+i,
-                            p_snapshot ? p_snapshot+i : NULL,
-                            1, l2_idx, i);
-
-    unmap_domain_page(p_shadow);
-    unmap_domain_page(p_guest);
-    if ( p_snapshot )
-        unmap_domain_page(p_snapshot);
-
-    return errors;
-}
-
-#define FAILPT(_f, _a...)                                         \
-    do {                                                          \
-        printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
-        errors++;                                                 \
-    } while ( 0 )
-
-static int check_l2_table(
-    struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes)
-{
-    struct domain *d = v->domain;
-    l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn);
-    l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn);
-    l2_pgentry_t match;
-    int i;
-    int errors = 0;
-    int limit;
-
-    if ( !oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != d) )
-        FAILPT("domain doesn't own page");
-    if ( oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != NULL) )
-        FAILPT("bogus owner for snapshot page");
-    if ( page_get_owner(mfn_to_page(smfn)) != NULL )
-        FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d",
-               smfn, page_get_owner(mfn_to_page(smfn))->domain_id);
-
-#if 0
-    if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-                &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-                ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
-                 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
-    {
-        for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-              i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
-              i++ )
-            printk("+++ (%d) %lx %lx\n",i,
-                   l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
-        FAILPT("hypervisor entries inconsistent");
-    }
-
-    if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
-          l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
-        FAILPT("hypervisor linear map inconsistent");
-#endif
-
-    match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
-    if ( !shadow_mode_external(d) &&
-         l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
-                         match, PAGE_FLAG_MASK))
-    {
-        FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte,
-               l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >>
-                                   L2_PAGETABLE_SHIFT]),
-               l2e_get_intpte(match));
-    }
-
-    match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
-    if ( !shadow_mode_external(d) &&
-         l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
-                         match, PAGE_FLAG_MASK))
-    {
-        FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte,
-               l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
-               d->arch.mm_perdomain_pt,
-               l2e_get_intpte(match));
-    }
-
-#if CONFIG_PAGING_LEVELS == 2
-    if ( shadow_mode_external(d) )
-        limit = L2_PAGETABLE_ENTRIES;
-    else
-        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-#else
-    limit = 0; /* XXX x86/64 XXX */
-#endif
-
-    /* Check the whole L2. */
-    for ( i = 0; i < limit; i++ )
-        errors += check_pte(v,
-                            (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
-                            (l1_pgentry_t*)(&spl2e[i]),
-                            NULL,
-                            2, i, 0);
-
-    unmap_domain_page(spl2e);
-    unmap_domain_page(gpl2e);
-
-#if 1
-    if ( errors )
-        printk("check_l2_table returning %d errors\n", errors);
-#endif
-
-    return errors;
-}
-#undef FAILPT
-
-int _check_pagetable(struct vcpu *v, char *s)
-{
-    struct domain *d = v->domain;
-#if CONFIG_PAGING_LEVELS == 4
-    pagetable_t pt = ((v->arch.flags & TF_kernel_mode)?
-                      v->arch.guest_table : v->arch.guest_table_user);
-#else
-    pagetable_t pt = v->arch.guest_table;
-#endif
-    unsigned long gptbase = pagetable_get_paddr(pt);
-    unsigned long ptbase_pfn, smfn;
-    unsigned long i;
-    l2_pgentry_t *gpl2e, *spl2e;
-    unsigned long ptbase_mfn = 0;
-    int errors = 0, limit, oos_pdes = 0;
-
-    //_audit_domain(d, AUDIT_QUIET);
-    shadow_lock(d);
-
-    sh_check_name = s;
-    //SH_VVLOG("%s-PT Audit", s);
-    sh_l2_present = sh_l1_present = 0;
-    perfc_incrc(check_pagetable);
-
-    ptbase_mfn = gptbase >> PAGE_SHIFT;
-    ptbase_pfn = mfn_to_gmfn(d, ptbase_mfn);
-
-    if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
-    {
-        printk("%s-PT %lx not shadowed\n", s, gptbase);
-        goto out;
-    }
-    if ( page_out_of_sync(mfn_to_page(ptbase_mfn)) )
-    {
-        ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
-        oos_pdes = 1;
-        ASSERT(ptbase_mfn);
-    }
-
-    errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes);
-
-    gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn);
-    spl2e = (l2_pgentry_t *) map_domain_page(smfn);
-
-    /* Go back and recurse. */
-#if CONFIG_PAGING_LEVELS == 2
-    if ( shadow_mode_external(d) )
-        limit = L2_PAGETABLE_ENTRIES;
-    else
-        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-#else
-    limit = 0; /* XXX x86/64 XXX */
-#endif
-
-    for ( i = 0; i < limit; i++ )
-    {
-        unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
-        unsigned long gl1mfn = gmfn_to_mfn(d, gl1pfn);
-        unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
-
-        if ( l2e_get_intpte(spl2e[i]) != 0 )  /* FIXME: check flags? */
-        {
-            errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i);
-        }
-    }
-
-    unmap_domain_page(spl2e);
-    unmap_domain_page(gpl2e);
-
-#if 0
-    SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
-             sh_l2_present, sh_l1_present);
-#endif
-
- out:
-    if ( errors )
-        BUG();
-
-    shadow_unlock(d);
-
-    return errors;
-}
-
-int _check_all_pagetables(struct vcpu *v, char *s)
-{
-    struct domain *d = v->domain;
-    int i;
-    struct shadow_status *a;
-    unsigned long gmfn;
-    int errors = 0;
-
-    shadow_status_noswap = 1;
-
-    sh_check_name = s;
-    SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id);
-    sh_l2_present = sh_l1_present = 0;
-    perfc_incrc(check_all_pagetables);
-
-    for (i = 0; i < shadow_ht_buckets; i++)
-    {
-        a = &d->arch.shadow_ht[i];
-        while ( a && a->gpfn_and_flags )
-        {
-            gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
-
-            switch ( a->gpfn_and_flags & PGT_type_mask )
-            {
-            case PGT_l1_shadow:
-                errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask,
-                                         gmfn, a->smfn, 0);
-                break;
-            case PGT_l2_shadow:
-                errors += check_l2_table(v, gmfn, a->smfn,
-                                         page_out_of_sync(mfn_to_page(gmfn)));
-                break;
-            case PGT_l3_shadow:
-            case PGT_l4_shadow:
-            case PGT_hl2_shadow:
-                BUG(); // XXX - ought to fix this...
-                break;
-            case PGT_snapshot:
-            case PGT_writable_pred:
-                break;
-            default:
-                errors++;
-                printk("unexpected shadow type %lx, gpfn=%lx, "
-                       "gmfn=%lx smfn=%lx\n",
-                       a->gpfn_and_flags & PGT_type_mask,
-                       a->gpfn_and_flags & PGT_mfn_mask,
-                       gmfn, a->smfn);
-                BUG();
-            }
-            a = a->next;
-        }
-    }
-
-    shadow_status_noswap = 0;
-
-    if ( errors )
-        BUG();
-
-    return errors;
-}
-
-#endif // SHADOW_DEBUG
-#endif // this code has not been updated for 32pae & 64 bit modes
-
-#if CONFIG_PAGING_LEVELS >= 3
-/****************************************************************************/
-/* 64-bit shadow-mode code testing */
-/****************************************************************************/
-/*
- * init_bl2() is for 32-bit VMX guest on 64-bit host
- * Using 1 shadow L4(l3) and 4 shadow L2s to simulate guest L2
- */
-static inline unsigned long init_bl2(
-    struct domain *d, unsigned long gpfn, unsigned long gmfn)
-{
-    unsigned int count;
-    unsigned long sl2mfn;
-    unsigned long smfn;
-    struct page_info *page;
-    l4_pgentry_t *spl4e;
-    void *l2;
-
-    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) )
-    {
-        printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
-        /* XXX Deal gracefully with failure. */
-        domain_crash_synchronous();
-    }
-
-    spl4e = (l4_pgentry_t *)map_domain_page(smfn);
-
-    /* Map the self entry, L4&L3 share the same page */
-    spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR);
-
-    /* Allocate 4 shadow L2s */
-    page = alloc_domheap_pages(NULL, SL2_ORDER, 0);
-    if ( !page )
-        domain_crash_synchronous();
-
-    for ( count = 0; count < PAE_L3_PAGETABLE_ENTRIES; count++ )
-    {
-        sl2mfn = page_to_mfn(page+count);
-        l2 = map_domain_page(sl2mfn);
-        memset(l2, 0, PAGE_SIZE);
-        unmap_domain_page(l2);
-        spl4e[count] = l4e_from_pfn(sl2mfn, _PAGE_PRESENT);
-    }
-
-    unmap_domain_page(spl4e);
-
-    return smfn;
-}
-
-static inline unsigned long init_l3(
-    struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
-{
-    unsigned long smfn;
-    l4_pgentry_t *spl4e;
-    unsigned long index;
-
-    if ( unlikely(!(smfn = alloc_shadow_page(v->domain, gpfn, gmfn, PGT_l4_shadow))) )
-    {
-        printk("Couldn't alloc an L4 shadow for pfn= %lx mfn= %lx\n", gpfn, gmfn);
-        BUG(); /* XXX Deal gracefully wiht failure. */
-    }
-
-    /* Map the self entry, L4&L3 share the same page */
-    spl4e = (l4_pgentry_t *)map_domain_page(smfn);
-
-    /*
-     * Shadow L4's pfn_info->tlbflush_timestamp
-     * should also save it's own index.
-     */
-
-    index = get_cr3_idxval(v);
-    frame_table[smfn].tlbflush_timestamp = index;
-
-    memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
-    spl4e[PAE_SHADOW_SELF_ENTRY] = l4e_from_pfn(smfn, __PAGE_HYPERVISOR);
-    unmap_domain_page(spl4e);
-    return smfn;
-}
-#endif
-
-#if CONFIG_PAGING_LEVELS == 3
-static unsigned long shadow_l3_table(
-    struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
-{
-    unsigned long smfn;
-    l3_pgentry_t *spl3e;
-    struct domain *d = v->domain;
-
-    perfc_incrc(shadow_l3_table_count);
-
-    SH_VVLOG("shadow_l3_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
-
-    if ( SH_L1_HAS_NEXT_PAGE &&
-         d->arch.ops->guest_paging_levels == PAGING_L2 )
-    {
-        return init_bl2(d, gpfn, gmfn);
-    }
-
-    if ( SH_GUEST_32PAE &&
-         d->arch.ops->guest_paging_levels == PAGING_L3 )
-    {
-        return init_l3(v, gpfn, gmfn);
-    }
-
-    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l3_shadow))) )
-    {
-            printk("Couldn't alloc an L3 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
-            BUG(); /* XXX Deal gracefully with failure. */
-    }
-
-    spl3e = (l3_pgentry_t *)map_domain_page(smfn);
-
-    /* Make the self entry */
-    spl3e[PAE_SHADOW_SELF_ENTRY] = l3e_from_pfn(smfn, __PAGE_HYPERVISOR);
-
-    if ( (PGT_base_page_table == PGT_l3_page_table) &&
-         !shadow_mode_external(d) ) {
-        int i;
-        unsigned long g2mfn, s2mfn;
-        l2_pgentry_t *spl2e;
-        l3_pgentry_t *gpl3e;
-
-        /* Get the top entry */
-        gpl3e = (l3_pgentry_t *)map_domain_page(gmfn);
-
-        if ( !(l3e_get_flags(gpl3e[L3_PAGETABLE_ENTRIES - 1]) & _PAGE_PRESENT) )
-        {
-            BUG();
-        }
-
-        g2mfn = l3e_get_pfn(gpl3e[L3_PAGETABLE_ENTRIES - 1]);
-
-        /* NB. g2mfn should be same as g2pfn */
-        if (!(s2mfn = __shadow_status(d, g2mfn, PGT_l2_shadow))) {
-            if ( unlikely(!(s2mfn =
-                    alloc_shadow_page(d, g2mfn, g2mfn, PGT_l2_shadow))) ) {
-                printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
-                    g2mfn, g2mfn);
-                BUG(); /* XXX Deal gracefully with failure. */
-            }
-        } 
-
-        if (!get_shadow_ref(s2mfn))
-            BUG();
-            
-        /* Map shadow L2 into shadow L3 */
-        spl3e[L3_PAGETABLE_ENTRIES - 1] = l3e_from_pfn(s2mfn, _PAGE_PRESENT);
-        shadow_update_min_max(smfn, L3_PAGETABLE_ENTRIES -1);
-
-        /*  
-         * Xen private mappings. Do the similar things as
-         * create_pae_xen_mappings().
-         */
-        spl2e = (l2_pgentry_t *)map_domain_page(s2mfn);
-
-        /*
-         * When we free L2 pages, we need to tell if the page contains
-         * Xen private mappings. Use the va_mask part.
-         */
-        mfn_to_page(s2mfn)->u.inuse.type_info |= 
-            (unsigned long) 3 << PGT_score_shift;
-
-        memset(spl2e, 0, 
-               (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)) * sizeof(l2_pgentry_t));
-
-        memcpy(&spl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
-           &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
-           L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));       
-
-        for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
-            spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
-                l2e_from_page(
-                    virt_to_page(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_pt) + i, 
-                    __PAGE_HYPERVISOR);
-        for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
-            spl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
-                (l3e_get_flags(gpl3e[i]) & _PAGE_PRESENT) ?
-                l2e_from_pfn(l3e_get_pfn(gpl3e[i]), __PAGE_HYPERVISOR) :
-                l2e_empty();
-       
-        unmap_domain_page(spl2e);
-        unmap_domain_page(gpl3e);
-    }
-    unmap_domain_page(spl3e);
-
-    return smfn;
-}
-#endif /* CONFIG_PAGING_LEVELS == 3 */
-
-#if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE))
-static unsigned long gva_to_gpa_pae(unsigned long gva)
-{
-    BUG();
-    return 43;
-}
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
-static unsigned long shadow_l4_table(
-  struct vcpu *v, unsigned long gpfn, unsigned long gmfn)
-{
-    unsigned long smfn;
-    l4_pgentry_t *spl4e;
-    struct domain *d = v->domain;
-
-    SH_VVLOG("shadow_l4_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
-
-    perfc_incrc(shadow_l4_table_count);
-
-    if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
-    {
-        return init_bl2(d, gpfn, gmfn);
-    }
-
-    if ( SH_GUEST_32PAE && d->arch.ops->guest_paging_levels == PAGING_L3 )
-    {
-        return init_l3(v, gpfn, gmfn);
-    }
-
-    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l4_shadow))) )
-    {
-        printk("Couldn't alloc an L4 shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
-        BUG(); /* XXX Deal gracefully with failure. */
-    }
-
-    spl4e = (l4_pgentry_t *)map_domain_page(smfn);
-
-    /* Install hypervisor and 4x linear p.t. mapings. */
-    if ( (PGT_base_page_table == PGT_l4_page_table) &&
-      !shadow_mode_external(d) )
-    {
-        /*
-         * We could proactively fill in PDEs for pages that are already
-         * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
-         * (restriction required for coherence of the accessed bit). However,
-         * we tried it and it didn't help performance. This is simpler.
-         */
-        memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
-
-        /* Install hypervisor and 2x linear p.t. mapings. */
-        memcpy(&spl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
-           &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
-           ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
-
-        spl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
-            l4e_from_paddr(__pa(page_get_owner(mfn_to_page(gmfn))->arch.mm_perdomain_l3),
-                            __PAGE_HYPERVISOR);
-
-        if ( shadow_mode_translate(d) ) // NB: not external
-        {
-            spl4e[l4_table_offset(RO_MPT_VIRT_START)] =
-                l4e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
-                                __PAGE_HYPERVISOR);
-        }
-        else
-            spl4e[l4_table_offset(LINEAR_PT_VIRT_START)] =
-                l4e_from_pfn(gmfn, __PAGE_HYPERVISOR);
-
-    } else
-        memset(spl4e, 0, L4_PAGETABLE_ENTRIES*sizeof(l4_pgentry_t));
-
-    unmap_domain_page(spl4e);
-
-    ESH_LOG("shadow_l4_table(%lx -> %lx)", gmfn, smfn);
-    return smfn;
-}
-#endif /* CONFIG_PAGING_LEVELS == 4 */
-
-#if CONFIG_PAGING_LEVELS >= 3
-static void 
-update_top_level_shadow(struct vcpu *v, unsigned long smfn)
-{
-    unsigned long index = get_cr3_idxval(v);
-    pgentry_64_t *sple = (pgentry_64_t *)map_domain_page(smfn);
-    pgentry_64_t *gple = (pgentry_64_t *)&v->arch.guest_vtable;
-    int i;
-
-    for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
-    {
-        unsigned long gpfn;
-
-        /*
-         * Looks like it's no longer a page table. 
-         */
-        if ( unlikely(entry_get_value(gple[index*4+i]) & PAE_PDPT_RESERVED) )
-        {
-            if ( entry_get_flags(sple[i]) & _PAGE_PRESENT )
-                put_shadow_ref(entry_get_pfn(sple[i]));
-
-            sple[i] = entry_empty();
-            continue;
-        }
-
-        gpfn = entry_get_pfn(gple[index*4+i]);
-
-        if ( unlikely(gpfn != (gpfn & PGT_mfn_mask)) )
-        {
-            if ( entry_get_flags(sple[i]) & _PAGE_PRESENT )
-                put_shadow_ref(entry_get_pfn(sple[i]));
-
-            sple[i] = entry_empty();
-            continue;
-        }
-
-        validate_entry_change(
-            v->domain, &gple[index*4+i], &sple[i], PAGING_L3);
-    }
-
-    unmap_domain_page(sple);
-}
-
-/*
- * validate_bl2e_change()
- * The code is for 32-bit HVM guest on 64-bit host.
- * To sync guest L2.
- */
-
-static inline void
-validate_bl2e_change(
-    struct domain *d,
-    guest_root_pgentry_t *new_gle_p,
-    pgentry_64_t *shadow_l3,
-    int index)
-{
-    int sl3_idx, sl2_idx;
-    unsigned long sl2mfn, sl1mfn;
-    pgentry_64_t *sl2_p;
-
-    /* Using guest l2 pte index to get shadow l3&l2 index
-     * index: 0 ~ 1023, PAGETABLE_ENTRIES: 512
-     */
-    sl3_idx = index / (PAGETABLE_ENTRIES / 2);
-    sl2_idx = (index % (PAGETABLE_ENTRIES / 2)) * 2;
-
-    sl2mfn = entry_get_pfn(shadow_l3[sl3_idx]);
-    sl2_p = (pgentry_64_t *)map_domain_page(sl2mfn);
-
-    validate_pde_change(
-        d, *(guest_l2_pgentry_t *)new_gle_p, (l2_pgentry_t *)&sl2_p[sl2_idx]);
-
-    /* Mapping the second l1 shadow page */
-    if (entry_get_flags(sl2_p[sl2_idx]) & _PAGE_PRESENT) {
-       sl1mfn = entry_get_pfn(sl2_p[sl2_idx]);
-       sl2_p[sl2_idx + 1] =
-            entry_from_pfn(sl1mfn + 1, entry_get_flags(sl2_p[sl2_idx]));
-    }
-    else
-        sl2_p[sl2_idx + 1] = (pgentry_64_t){0};
-    unmap_domain_page(sl2_p);
-
-}
-
-/*
- * This shadow_mark_va_out_of_sync() is for 2M page shadow
- */
-static void shadow_mark_va_out_of_sync_2mp(
-  struct vcpu *v, unsigned long gpfn, unsigned long mfn, paddr_t writable_pl1e)
-{
-    struct out_of_sync_entry *entry =
-      shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
-
-    entry->writable_pl1e = writable_pl1e;
-    ESH_LOG("<shadow_mark_va_out_of_sync_2mp> gpfn = %lx\n", gpfn);
-    if ( !get_shadow_ref(writable_pl1e >> L1_PAGETABLE_SHIFT) )
-        BUG();
-}
-
-static int get_shadow_mfn(struct domain *d, unsigned long gpfn, unsigned long *spmfn, u32 flag)
-{
-    unsigned long gmfn;
-    if ( !(*spmfn = __shadow_status(d, gpfn, flag)) )
-    {
-        /* This is NOT already shadowed so we need to shadow it. */
-        SH_VVLOG("<get_shadow_mfn>: not shadowed");
-
-        gmfn = gmfn_to_mfn(d, gpfn);
-        if ( unlikely(!VALID_MFN(gmfn)) )
-        {
-            // Attempt to use an invalid pfn as an shadow page.
-            // XXX this needs to be more graceful!
-            BUG();
-        }
-
-        if ( unlikely(!(*spmfn =
-                  alloc_shadow_page(d, gpfn, gmfn, flag))) )
-        {
-            printk("<get_shadow_mfn>Couldn't alloc an shadow for pfn=%lx mfn=%lx\n", gpfn, gmfn);
-            BUG(); /* XXX Need to deal gracefully with failure. */
-        }
-        switch(flag) {
-            case PGT_l1_shadow:
-                perfc_incrc(shadow_l1_table_count);
-                break;
-            case PGT_l2_shadow:
-                perfc_incrc(shadow_l2_table_count);
-                break;
-            case PGT_l3_shadow:
-                perfc_incrc(shadow_l3_table_count);
-                break;
-            case PGT_hl2_shadow:
-                perfc_incrc(shadow_hl2_table_count);
-                break;
-        }
-
-        return 1;
-    } else {
-        /* This L1 is shadowed already, but the L2 entry is missing. */
-        SH_VVLOG("4b: was shadowed, l2 missing (%lx)", *spmfn);
-        return 0;
-    }
-}
-
-static void shadow_map_into_current(struct vcpu *v,
-  unsigned long va, unsigned int from, unsigned int to)
-{
-    pgentry_64_t gle = {0}, sle;
-    unsigned long gpfn, smfn;
-
-    if (from == PAGING_L1 && to == PAGING_L2) {
-        shadow_map_l1_into_current_l2(va);
-        return;
-    }
-
-    __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | to);
-    ASSERT(entry_get_flags(gle) & _PAGE_PRESENT);
-    gpfn = entry_get_pfn(gle);
-
-    get_shadow_mfn(v->domain, gpfn, &smfn, shadow_level_to_type(from));
-
-    if ( !get_shadow_ref(smfn) )
-        BUG();
-    entry_general(v->domain, &gle, &sle, smfn, to);
-    __rw_entry(v, va, &gle, GUEST_ENTRY | SET_ENTRY | to);
-    __rw_entry(v, va, &sle, SHADOW_ENTRY | SET_ENTRY | to);
-}
-
-/*
- * shadow_set_lxe should be put in shadow.h
- */
-static void shadow_set_l2e_64(unsigned long va, l2_pgentry_t sl2e,
-  int create_l2_shadow, int put_ref_check)
-{
-    struct vcpu *v = current;
-    l4_pgentry_t sl4e;
-    l3_pgentry_t sl3e;
-
-    __shadow_get_l4e(v, va, &sl4e);
-    if (!(l4e_get_flags(sl4e) & _PAGE_PRESENT)) {
-        if (create_l2_shadow) {
-            perfc_incrc(shadow_set_l3e_force_map);
-            shadow_map_into_current(v, va, PAGING_L3, PAGING_L4);
-            __shadow_get_l4e(v, va, &sl4e);
-        } else {
-            printk("For non HVM shadow, create_l1_shadow:%d\n", create_l2_shadow);
-        }
-    }
-
-    __shadow_get_l3e(v, va, &sl3e);
-    if (!(l3e_get_flags(sl3e) & _PAGE_PRESENT)) {
-        if (create_l2_shadow) {
-            perfc_incrc(shadow_set_l2e_force_map);
-            shadow_map_into_current(v, va, PAGING_L2, PAGING_L3);
-            __shadow_get_l3e(v, va, &sl3e);
-        } else {
-            printk("For non HVM shadow, create_l1_shadow:%d\n", create_l2_shadow);
-        }
-
-        if ( v->domain->arch.ops->guest_paging_levels == PAGING_L4 )
-            shadow_update_min_max(l4e_get_pfn(sl4e), l3_table_offset(va));
-    }
-
-    if ( put_ref_check ) {
-        l2_pgentry_t tmp_sl2e;
-        if ( __shadow_get_l2e(v, va, &tmp_sl2e) ) {
-            if ( l2e_get_flags(tmp_sl2e) & _PAGE_PRESENT )
-                if ( l2e_get_pfn(tmp_sl2e) == l2e_get_pfn(sl2e) ) {
-                    put_shadow_ref(l2e_get_pfn(sl2e));
-                }
-        }
-
-    }
-
-    if (! __shadow_set_l2e(v, va, &sl2e))
-        BUG();
-    shadow_update_min_max(l3e_get_pfn(sl3e), l2_table_offset(va));
-}
-
-
-/* As 32-bit guest don't support 4M page yet,
- * we don't concern double compile for this function
- */
-static inline int l2e_rw_fault(
-    struct vcpu *v, l2_pgentry_t *gl2e_p, unsigned long va, int rw)
-{
-    struct domain *d = v->domain;
-    l2_pgentry_t gl2e = *gl2e_p;
-    l2_pgentry_t tmp_l2e = gl2e;
-    unsigned long start_gpfn = l2e_get_pfn(gl2e);
-    unsigned long gpfn, mfn;
-    unsigned long l1_mfn, gmfn;
-    l1_pgentry_t *l1_p;
-    l1_pgentry_t sl1e;
-    l1_pgentry_t old_sl1e;
-    l2_pgentry_t sl2e;
-#ifdef __x86_64__
-    u64 nx = 0;
-#endif
-    int put_ref_check = 0;
-    /* Check if gpfn is 2M aligned */
-
-    /* Update guest l2e */
-    if (rw) {
-        ASSERT(l2e_get_flags(gl2e) & _PAGE_RW);
-        l2e_add_flags(gl2e, _PAGE_DIRTY | _PAGE_ACCESSED);
-    } else {
-        l2e_add_flags(gl2e, _PAGE_ACCESSED);
-    }
-
-    l2e_remove_flags(tmp_l2e, _PAGE_PSE);
-    if (l2e_get_flags(gl2e) & _PAGE_NX) {
-        l2e_remove_flags(tmp_l2e, _PAGE_NX);
-#ifdef __x86_64__
-        nx = PGT_high_mfn_nx;
-#endif
-    }
-
-
-    /* Get the shadow l2 first */
-    if ( !__shadow_get_l2e(v, va, &sl2e) )
-        sl2e = l2e_empty();
-
-#ifdef __x86_64__
-    l1_mfn = __shadow_status(d, start_gpfn | nx, PGT_fl1_shadow);
-#else
-    l1_mfn = __shadow_status(d, start_gpfn, PGT_fl1_shadow);
-#endif
-
-    /* Check the corresponding l2e */
-    if (l1_mfn) {
-        /* Why it is PRESENT?*/
-        if ((l2e_get_flags(sl2e) & _PAGE_PRESENT) &&
-                l2e_get_pfn(sl2e) == l1_mfn) {
-            ESH_LOG("sl2e PRSENT bit is set: %lx, l1_mfn = %lx\n", l2e_get_pfn(sl2e), l1_mfn);
-        } else {
-            put_ref_check = 1;
-            if (!get_shadow_ref(l1_mfn))
-                BUG();
-        }
-        l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn);
-        sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
-    } else {
-        /* Allocate a new page as shadow page table if need */
-        gmfn = gmfn_to_mfn(d, start_gpfn);
-#ifdef __x86_64__
-        l1_mfn = alloc_shadow_page(d, start_gpfn | nx, gmfn, PGT_fl1_shadow);
-#else
-        l1_mfn = alloc_shadow_page(d, start_gpfn, gmfn, PGT_fl1_shadow);
-#endif
-        if (unlikely(!l1_mfn)) {
-            BUG();
-        }
-
-        if (!get_shadow_ref(l1_mfn))
-            BUG();
-        l1_p = (l1_pgentry_t *)map_domain_page(l1_mfn );
-        sl2e = l2e_from_pfn(l1_mfn, l2e_get_flags(tmp_l2e));
-        memset(l1_p, 0, PAGE_SIZE);
-        ESH_LOG("Alloc a shadow page: %lx\n", l1_mfn);
-    }
-
-    ESH_LOG("<%s>: sl2e = %lx\n", __func__, l2e_get_intpte(sl2e));
-    /* Map the page to l2*/
-    shadow_set_l2e_64(va, sl2e, 1, put_ref_check);
-
-    if (l2e_get_flags(gl2e) & _PAGE_NX)
-        l2e_add_flags(tmp_l2e, _PAGE_NX);
-
-    /* Propagate the shadow page table, i.e. setting sl1e */
-    for (gpfn = start_gpfn;
-      gpfn < (start_gpfn + L1_PAGETABLE_ENTRIES); gpfn++) {
-
-        mfn = gmfn_to_mfn(d, gpfn);
-
-        if ( unlikely(!VALID_MFN(mfn)) )
-        {
-            continue;
-        }
-
-        sl1e = l1e_from_pfn(mfn, l2e_get_flags(tmp_l2e));
-
-        if (!rw) {
-            if ( shadow_mode_log_dirty(d) ||
-              !(l2e_get_flags(gl2e) & _PAGE_DIRTY) || mfn_is_page_table(mfn) )
-            {
-                l1e_remove_flags(sl1e, _PAGE_RW);
-            }
-        } else {
-            /* __mark_dirty(d, gmfn); */
-        }
-       // printk("<%s> gpfn: %lx, mfn: %lx, sl1e: %lx\n", __func__, gpfn, mfn, l1e_get_intpte(sl1e));
-        /* The shadow entrys need setup before shadow_mark_va_out_of_sync()*/
-        old_sl1e = l1_p[gpfn - start_gpfn];
-
-        if ( l1e_has_changed(old_sl1e, sl1e, _PAGE_RW | _PAGE_PRESENT) )
-        {
-            if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
-              !shadow_get_page_from_l1e(sl1e, d) ) {
-                ESH_LOG("%lx, mfn: %lx why make me empty, start_pfn: %lx, gpfn: %lx\n", l1e_get_intpte(sl1e),mfn, start_gpfn, gpfn);
-                sl1e = l1e_empty();
-            }
-            if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
-                put_page_from_l1e(old_sl1e, d);
-        }
-
-        if (rw) {
-            /* shadow_mark_va_out_of_sync() need modificatin for 2M pages*/
-            if ( mfn_is_page_table(mfn) )
-                shadow_mark_va_out_of_sync_2mp(v, gpfn, mfn,
-                  l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * (gpfn - start_gpfn)));
-        }
-
-        l1_p[gpfn - start_gpfn] = sl1e;
-    }
-
-    unmap_domain_page(l1_p);
-    *gl2e_p = gl2e;
-    return 1;
-}
-
-/*
- * Check P, R/W, U/S bits in the guest page table.
- * If the fault belongs to guest return 1,
- * else return 0.
- */
-#if defined( GUEST_PGENTRY_32 )
-static inline int guest_page_fault(
-    struct vcpu *v,
-    unsigned long va, unsigned int error_code,
-    guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e)
-{
-    /* The following check for 32-bit guest on 64-bit host */
-
-    __guest_get_l2e(v, va, gpl2e);
-
-    /* Check the guest L2 page-table entry first*/
-    if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_PRESENT)) )
-        return 1;
-
-    if ( error_code & ERROR_W ) 
-    {
-        if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_RW)) )
-            return 1;
-    }
-
-    if ( error_code & ERROR_U ) 
-    {
-        if ( unlikely(!(guest_l2e_get_flags(*gpl2e) & _PAGE_USER)) )
-            return 1;
-    }
-
-    if ( guest_l2e_get_flags(*gpl2e) & _PAGE_PSE )
-    {
-        printk("None-PAE HVM guests can NOT use PSE, "
-               "because we don't support 4MBytes PSE pages.\n");
-        printk("remove pae=1 from your config file.\n");
-        domain_crash_synchronous();
-        return 0;
-    }
-
-    __guest_get_l1e(v, va, gpl1e);
-
-    /* Then check the guest L1 page-table entry */
-    if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_PRESENT)) )
-        return 1;
-
-    if ( error_code & ERROR_W ) 
-    {
-        if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_RW)) )
-            return 1;
-    }
-
-    if ( error_code & ERROR_U ) 
-    {
-        if ( unlikely(!(guest_l1e_get_flags(*gpl1e) & _PAGE_USER)) )
-            return 1;
-    }
-
-    return 0;
-}
-#else
-static inline int guest_page_fault(
-    struct vcpu *v,
-    unsigned long va, unsigned int error_code,
-    guest_l2_pgentry_t *gpl2e, guest_l1_pgentry_t *gpl1e)
-{
-    struct domain *d = v->domain;
-    pgentry_64_t gle = { 0 };
-    unsigned long gpfn = 0, mfn;
-    int i;
-    unsigned int base_idx = 0;
-    base_idx = get_cr3_idxval(v);
-
-    ASSERT( d->arch.ops->guest_paging_levels >= PAGING_L3 );
-
-#if CONFIG_PAGING_LEVELS >= 3
-    if ( (error_code & (ERROR_I | ERROR_P)) == (ERROR_I | ERROR_P) )
-        return 1;
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
-    if ( d->arch.ops->guest_paging_levels == PAGING_L4 )
-    {
-        __rw_entry(v, va, &gle, GUEST_ENTRY | GET_ENTRY | PAGING_L4);
-        if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) )
-            return 1;
-
-        if ( error_code & ERROR_W )
-        {
-            if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) )
-                return 1;
-        }
-
-        if ( error_code & ERROR_U )
-        {
-            if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) )
-                return 1;
-        }
-        gpfn = entry_get_pfn(gle);
-    }
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 3
-    if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
-    {
-        if ( SH_GUEST_32PAE )
-            gpfn = (hvm_get_guest_ctrl_reg(v, 3)) >> PAGE_SHIFT;
-        else
-            gpfn = pagetable_get_pfn(v->arch.guest_table);
-    }
-#endif
-
-    for ( i = PAGING_L3; i >= PAGING_L1; i-- )
-    {
-        pgentry_64_t *lva;
-        /*
-         * If it's not external mode, then mfn should be machine physical.
-         */
-        mfn = gmfn_to_mfn(d, gpfn);
-
-        lva = (pgentry_64_t *) map_domain_page(mfn);
-        gle = lva[guest_table_offset_64(va, i, base_idx)];
-
-        unmap_domain_page(lva);
-
-        gpfn = entry_get_pfn(gle);
-
-        if ( unlikely(!(entry_get_flags(gle) & _PAGE_PRESENT)) )
-            return 1;
-
-        if ( i < PAGING_L3 ||
-             d->arch.ops->guest_paging_levels == PAGING_L4 )
-        {
-            if ( error_code & ERROR_W )
-            {
-                if ( unlikely(!(entry_get_flags(gle) & _PAGE_RW)) )
-                {
-                    if ( i == PAGING_L1 )
-                        if ( gpl1e )
-                            gpl1e->l1 = gle.lo;
-                    return 1;
-                }
-            }
-            if ( error_code & ERROR_U )
-            {
-                if ( unlikely(!(entry_get_flags(gle) & _PAGE_USER)) )
-                    return 1;
-            }
-        }
-
-        if ( i == PAGING_L2 )
-        {
-            if ( gpl2e )
-                gpl2e->l2 = gle.lo;
-            if ( likely(entry_get_flags(gle) & _PAGE_PSE) )
-                return 0;
-        }
-
-        if ( i == PAGING_L1 )
-            if ( gpl1e )
-                gpl1e->l1 = gle.lo;
-    }
-
-    return 0;
-
-}
-#endif
-
-static int shadow_fault_64(unsigned long va, struct cpu_user_regs *regs)
-{
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    guest_l2_pgentry_t gl2e;
-    guest_l1_pgentry_t gl1e, orig_gl1e;
-    l1_pgentry_t sl1e;
-
-    gl1e = guest_l1e_empty(); gl2e = guest_l2e_empty();
-
-    sl1e = l1e_empty();
-
-    perfc_incrc(shadow_fault_calls);
-
-    ESH_LOG("<shadow_fault_64> va=%lx,  rip = %lx, error code = %x\n",
-            va, regs->eip, regs->error_code);
-
-    /*
-     * Don't let someone else take the guest's table pages out-of-sync.
-     */
-    shadow_lock(d);
-
-    /*
-     * STEP 1. Check to see if this fault might have been caused by an
-     *         out-of-sync table page entry, or if we should pass this
-     *         fault onto the guest.
-     */
-    __shadow_sync_va(v, va);
-
-    /*
-     * STEP 2. Check if the fault belongs to guest
-     */
-    if ( guest_page_fault(v, va, regs->error_code, &gl2e, &gl1e) ) 
-    {
-        if ( unlikely(shadow_mode_log_dirty(d)) && l1e_get_intpte(gl1e) != 0 )
-            goto check_writeable;
-        
-        goto fail;
-    }
-
-    if ( unlikely((guest_l2e_get_flags(gl2e) & _PAGE_PSE)) ) 
-        goto pse;
-
-    /*
-     * Handle 4K pages here
-     */
-check_writeable:
-    orig_gl1e = gl1e;
-    
-    /* Write fault? */
-    if ( regs->error_code & 2 ) 
-    {
-        int allow_writes = 0;
-
-        if ( unlikely(!(guest_l1e_get_flags(gl1e) & _PAGE_RW)) )
-        {
-            if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gl1e)) )
-            {
-                allow_writes = 1;
-                l1e_add_flags(gl1e, _PAGE_RW);
-            }
-            else
-            {
-                /* Write fault on a read-only mapping. */
-                SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")", 
-                         l1e_get_intpte(gl1e));
-                perfc_incrc(shadow_fault_bail_ro_mapping);
-                goto fail;
-            }
-        }
-
-        if ( !l1pte_write_fault(v, &gl1e, &sl1e, va) ) 
-        {
-            SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
-            perfc_incrc(write_fault_bail);
-            shadow_unlock(d);
-            return 0;
-        }
-        if (allow_writes)
-            l1e_remove_flags(gl1e, _PAGE_RW);
-    }
-    else 
-    {
-        if ( !l1pte_read_fault(d, &gl1e, &sl1e) )
-        {
-            SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
-            perfc_incrc(read_fault_bail);
-            shadow_unlock(d);
-            return 0;
-        }
-    }
-
-    /*
-     * STEP 3. Write the modified shadow PTE and guest PTE back to the tables
-     */
-    if ( l1e_has_changed(orig_gl1e, gl1e, PAGE_FLAG_MASK) )
-    {
-        if (unlikely(!__guest_set_l1e(v, va, &gl1e))) 
-            domain_crash_synchronous();
-
-        __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gl2e)));
-    }
-
-    shadow_set_l1e_64(va, (pgentry_64_t *)&sl1e, 1);
-
-    perfc_incrc(shadow_fault_fixed);
-    d->arch.shadow_fault_count++;
-
-    shadow_unlock(d);
-
-    return EXCRET_fault_fixed;
-
-pse:
-    /*
-     * Handle 2M pages here
-     */
-    if ( unlikely(!shadow_mode_external(d)) )
-        BUG();
-
-    /* Write fault? */
-    if ( regs->error_code & 2 ) 
-    {
-        if ( !l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, WRITE_FAULT) ) 
-        {
-            goto fail;
-        }
-    } 
-    else 
-    {
-        l2e_rw_fault(v, (l2_pgentry_t *)&gl2e, va, READ_FAULT);
-    }
-
-    /*
-     * STEP 3. Write guest/shadow l2e back
-     */
-
-    if ( unlikely(!__guest_set_l2e(v, va, &gl2e)) ) 
-    {
-        domain_crash_synchronous();
-    }
-
-    /*
-     * Todo: if necessary, record the page table page as dirty
-     */
-
-    perfc_incrc(shadow_fault_fixed);
-    d->arch.shadow_fault_count++;
-
-    shadow_unlock(d);
-
-    return EXCRET_fault_fixed;
-fail:
-    shadow_unlock(d);
-    ESH_LOG("Guest fault~~~\n");
-    return 0;
-}
-
-static void shadow_invlpg_64(struct vcpu *v, unsigned long va)
-{
-    struct domain *d = v->domain;
-    l1_pgentry_t  sl1e, old_sl1e;
-
-    shadow_lock(d);
-
-    __shadow_sync_va(v, va);
-
-    if ( shadow_mode_external(d) && __shadow_get_l1e(v, va, &old_sl1e) )
-        if ( l1e_get_flags(old_sl1e) & _PAGE_PRESENT )
-            put_page_from_l1e(old_sl1e, d);
-
-    sl1e = l1e_empty();
-    __shadow_set_l1e(v, va, &sl1e);
-
-    shadow_unlock(d);
-}
-
-static unsigned long gva_to_gpa_64(unsigned long gva)
-{
-    struct vcpu *v = current;
-    guest_l1_pgentry_t gl1e = {0};
-    guest_l2_pgentry_t gl2e = {0};
-    unsigned long gpa;
-
-    if (guest_page_fault(v, gva, 0, &gl2e, &gl1e))
-        return 0;
-
-    if (guest_l2e_get_flags(gl2e) & _PAGE_PSE)
-        gpa = guest_l2e_get_paddr(gl2e) + (gva & ((1 << GUEST_L2_PAGETABLE_SHIFT) - 1));
-    else
-        gpa = guest_l1e_get_paddr(gl1e) + (gva & ~PAGE_MASK);
-
-    return gpa;
-}
-
-/*
- * The naming convention of the shadow_ops:
- * MODE_<pgentry size>_<guest paging levels>_HANDLER
- */
-#if (!defined(GUEST_PGENTRY_32) && !defined(GUEST_32PAE))
-struct shadow_ops MODE_64_3_HANDLER = {
-    .guest_paging_levels        = 3,
-    .invlpg                     = shadow_invlpg_64,
-    .fault                      = shadow_fault_64,
-    .update_pagetables          = shadow_update_pagetables,
-    .sync_all                   = sync_all,
-    .remove_all_write_access    = remove_all_write_access,
-    .do_update_va_mapping       = do_update_va_mapping,
-    .mark_mfn_out_of_sync       = mark_mfn_out_of_sync,
-    .is_out_of_sync             = is_out_of_sync,
-    .gva_to_gpa                 = gva_to_gpa_pae,
-};
-
-struct shadow_ops MODE_64_4_HANDLER = {
-    .guest_paging_levels        = 4,
-    .invlpg                     = shadow_invlpg_64,
-    .fault                      = shadow_fault_64,
-    .update_pagetables          = shadow_update_pagetables,
-    .sync_all                   = sync_all,
-    .remove_all_write_access    = remove_all_write_access,
-    .do_update_va_mapping       = do_update_va_mapping,
-    .mark_mfn_out_of_sync       = mark_mfn_out_of_sync,
-    .is_out_of_sync             = is_out_of_sync,
-    .gva_to_gpa                 = gva_to_gpa_64,
-};
-#endif /* GUEST_PGENTRY_32 */
-#endif /* CONFIG_PAGING_LEVELS >= 3 */
-
-
-#if CONFIG_PAGING_LEVELS == 2
-struct shadow_ops MODE_32_2_HANDLER = {
-    .guest_paging_levels        = 2,
-    .invlpg                     = shadow_invlpg_32,
-    .fault                      = shadow_fault_32,
-    .update_pagetables          = shadow_update_pagetables,
-    .sync_all                   = sync_all,
-    .remove_all_write_access    = remove_all_write_access,
-    .do_update_va_mapping       = do_update_va_mapping,
-    .mark_mfn_out_of_sync       = mark_mfn_out_of_sync,
-    .is_out_of_sync             = is_out_of_sync,
-    .gva_to_gpa                 = gva_to_gpa_64,
-};
-#endif
-
-#if ( CONFIG_PAGING_LEVELS == 3 && !defined (GUEST_PGENTRY_32) && !defined (GUEST_32PAE) ) ||  \
-    ( CONFIG_PAGING_LEVELS == 4 && defined (GUEST_PGENTRY_32) ) 
-
-
-/* 
- * Use GUEST_PGENTRY_32 to force PAE_SHADOW_SELF_ENTRY for L4.
- *
- * Very simple shadow code to handle 1:1 direct mapping for guest 
- * non-paging code, which actually is running in PAE/vm86 mode with 
- * paging-enabled.
- *
- * We expect that the top level (L3) page has been allocated and initialized.
- */
-int shadow_direct_map_fault(unsigned long vpa, struct cpu_user_regs *regs)
-{
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    l3_pgentry_t sl3e, *sl3e_p;
-    l2_pgentry_t sl2e, *sl2e_p;
-    l1_pgentry_t sl1e;
-    unsigned long mfn, smfn;
-    struct page_info *page;
-
-    /*
-     * If the faulting address is within the MMIO range, we continue
-     * on handling the #PF as such.
-     */
-    if ( (mfn = get_mfn_from_gpfn(vpa >> PAGE_SHIFT)) == INVALID_MFN )
-        return 0;
-
-    shadow_lock(d);
-
-    __direct_get_l3e(v, vpa, &sl3e);
-
-    if ( !(l3e_get_flags(sl3e) & _PAGE_PRESENT) )
-    {
-        page = alloc_domheap_page(NULL);
-        if ( !page )
-            goto nomem;
-
-        smfn = page_to_mfn(page);
-        sl3e = l3e_from_pfn(smfn, _PAGE_PRESENT);
-
-        sl3e_p = (l3_pgentry_t *)map_domain_page(smfn);
-        memset(sl3e_p, 0, PAGE_SIZE);
-        unmap_domain_page(sl3e_p);
-
-        __direct_set_l3e(v, vpa, &sl3e);
-    }
-
-    __direct_get_l2e(v, vpa, &sl2e);
-
-    if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
-    {
-        page = alloc_domheap_page(NULL);
-        if ( !page )
-            goto nomem;
-
-        smfn = page_to_mfn(page);
-        sl2e = l2e_from_pfn(smfn, __PAGE_HYPERVISOR | _PAGE_USER);
-        sl2e_p = (l2_pgentry_t *)map_domain_page(smfn);
-        memset(sl2e_p, 0, PAGE_SIZE);
-        unmap_domain_page(sl2e_p);
-
-        __direct_set_l2e(v, vpa, &sl2e);
-    }
-
-    __direct_get_l1e(v, vpa, &sl1e);
-
-    if ( !(l1e_get_flags(sl1e) & _PAGE_PRESENT) )
-    {
-        sl1e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR | _PAGE_USER);
-        __direct_set_l1e(v, vpa, &sl1e);
-    }
-
-    shadow_unlock(d);
-    return EXCRET_fault_fixed;
-
-nomem:
-    shadow_direct_map_clean(d);
-    domain_crash_synchronous();
-}
-#endif
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/arch/x86/shadow2-common.c b/xen/arch/x86/shadow2-common.c
new file mode 100644 (file)
index 0000000..eab6361
--- /dev/null
@@ -0,0 +1,3394 @@
+/******************************************************************************
+ * arch/x86/shadow2-common.c
+ *
+ * Shadow2 code that does not need to be multiply compiled.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#define SHADOW2 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/irq.h>
+#include <xen/domain_page.h>
+#include <xen/guest_access.h>
+#include <asm/event.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/flushtlb.h>
+#include <asm/shadow2.h>
+#include <asm/shadow2-private.h>
+
+#if SHADOW2_AUDIT
+int shadow2_audit_enable = 0;
+#endif
+
+static void sh2_free_log_dirty_bitmap(struct domain *d);
+
+int _shadow2_mode_refcounts(struct domain *d)
+{
+    return shadow2_mode_refcounts(d);
+}
+
+
+/**************************************************************************/
+/* x86 emulator support for the shadow2 code
+ */
+
+static int
+sh2_x86_emulate_read_std(unsigned long addr,
+                         unsigned long *val,
+                         unsigned int bytes,
+                         struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+    if ( hvm_guest(v) )
+    {
+        *val = 0;
+        // XXX -- this is WRONG.
+        //        It entirely ignores the permissions in the page tables.
+        //        In this case, that is only a user vs supervisor access check.
+        //
+        if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
+        {
+#if 0
+            SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+                           v->domain->domain_id, v->vcpu_id, 
+                           addr, *val, bytes);
+#endif
+            return X86EMUL_CONTINUE;
+        }
+
+        /* If we got here, there was nothing mapped here, or a bad GFN 
+         * was mapped here.  This should never happen: we're here because
+         * of a write fault at the end of the instruction we're emulating. */ 
+        SHADOW2_PRINTK("read failed to va %#lx\n", addr);
+        return X86EMUL_PROPAGATE_FAULT;
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+static int
+sh2_x86_emulate_write_std(unsigned long addr,
+                          unsigned long val,
+                          unsigned int bytes,
+                          struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+#if 0
+    SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+                  v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+    if ( hvm_guest(v) )
+    {
+        // XXX -- this is WRONG.
+        //        It entirely ignores the permissions in the page tables.
+        //        In this case, that includes user vs supervisor, and
+        //        write access.
+        //
+        if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
+            return X86EMUL_CONTINUE;
+
+        /* If we got here, there was nothing mapped here, or a bad GFN 
+         * was mapped here.  This should never happen: we're here because
+         * of a write fault at the end of the instruction we're emulating,
+         * which should be handled by sh2_x86_emulate_write_emulated. */ 
+        SHADOW2_PRINTK("write failed to va %#lx\n", addr);
+        return X86EMUL_PROPAGATE_FAULT;
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+static int
+sh2_x86_emulate_write_emulated(unsigned long addr,
+                               unsigned long val,
+                               unsigned int bytes,
+                               struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+#if 0
+    SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+                  v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+    if ( hvm_guest(v) )
+    {
+        return v->arch.shadow2->x86_emulate_write(v, addr, &val, bytes, ctxt);
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+static int 
+sh2_x86_emulate_cmpxchg_emulated(unsigned long addr,
+                                 unsigned long old,
+                                 unsigned long new,
+                                 unsigned int bytes,
+                                 struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+#if 0
+    SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
+                   v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
+#endif
+    if ( hvm_guest(v) )
+    {
+        return v->arch.shadow2->x86_emulate_cmpxchg(v, addr, old, new, 
+                                                    bytes, ctxt);
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+static int 
+sh2_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
+                                   unsigned long old_lo,
+                                   unsigned long old_hi,
+                                   unsigned long new_lo,
+                                   unsigned long new_hi,
+                                   struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+#if 0
+    SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
+                   v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
+                   new_hi, new_lo, ctxt);
+#endif
+    if ( hvm_guest(v) )
+    {
+        return v->arch.shadow2->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
+                                                      new_lo, new_hi, ctxt);
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+
+struct x86_emulate_ops shadow2_emulator_ops = {
+    .read_std           = sh2_x86_emulate_read_std,
+    .write_std          = sh2_x86_emulate_write_std,
+    .read_emulated      = sh2_x86_emulate_read_std,
+    .write_emulated     = sh2_x86_emulate_write_emulated,
+    .cmpxchg_emulated   = sh2_x86_emulate_cmpxchg_emulated,
+    .cmpxchg8b_emulated = sh2_x86_emulate_cmpxchg8b_emulated,
+};
+
+
+/**************************************************************************/
+/* Code for "promoting" a guest page to the point where the shadow code is
+ * willing to let it be treated as a guest page table.  This generally
+ * involves making sure there are no writable mappings available to the guest
+ * for this page.
+ */
+void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+    struct page_info *page = mfn_to_page(gmfn);
+    unsigned long type_info;
+
+    ASSERT(valid_mfn(gmfn));
+
+    /* We should never try to promote a gmfn that has writeable mappings */
+    ASSERT(shadow2_remove_write_access(v, gmfn, 0, 0) == 0);
+
+    // Is the page already shadowed?
+    if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
+    {
+        // No prior shadow exists...
+
+        // Grab a type-ref.  We don't really care if we are racing with another
+        // vcpu or not, or even what kind of type we get; we just want the type
+        // count to be > 0.
+        //
+        do {
+            type_info =
+                page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask);
+        } while ( !get_page_type(page, type_info) );
+
+        // Now that the type ref is non-zero, we can safely use the
+        // shadow2_flags.
+        //
+        page->shadow2_flags = 0;
+    }
+
+    ASSERT(!test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
+    set_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
+}
+
+void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+    struct page_info *page = mfn_to_page(gmfn);
+
+    ASSERT(test_bit(_PGC_page_table, &page->count_info));
+    ASSERT(test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
+
+    clear_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
+
+    if ( (page->shadow2_flags & SH2F_page_type_mask) == 0 )
+    {
+        // release the extra type ref
+        put_page_type(page);
+
+        // clear the is-a-page-table bit.
+        clear_bit(_PGC_page_table, &page->count_info);
+    }
+}
+
+/**************************************************************************/
+/* Validate a pagetable change from the guest and update the shadows.
+ * Returns a bitmask of SHADOW2_SET_* flags. */
+
+/* Worker: for each type of shadow that gmfn has (per its shadow2_flags),
+ * call the matching per-mode map-and-validate routine to bring the shadow
+ * entries back in sync after a guest write of 'size' bytes at 'entry'.
+ * Returns the OR of the SHADOW2_SET_* results from those calls. */
+static int
+__shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, 
+                               void *entry, u32 size)
+{
+    int result = 0;
+    struct page_info *page = mfn_to_page(gmfn);
+
+    sh2_mark_dirty(v->domain, gmfn);
+    
+    // Determine which types of shadows are affected, and update each.
+    //
+    // Always validate L1s before L2s to prevent another cpu with a linear
+    // mapping of this gmfn from seeing a walk that results from 
+    // using the new L2 value and the old L1 value.  (It is OK for such a
+    // guest to see a walk that uses the old L2 value with the new L1 value,
+    // as hardware could behave this way if one level of the pagewalk occurs
+    // before the store, and the next level of the pagewalk occurs after the
+    // store.)
+    //
+    // Ditto for L2s before L3s, etc.
+    //
+
+    if ( !(page->count_info & PGC_page_table) )
+        return 0;  /* Not shadowed at all */
+
+    /* 32-bit guest shadows: on a 2-level hypervisor they are 2-level
+     * shadows; on PAE/64-bit they are instantiated as 3-level shadows. */
+#if CONFIG_PAGING_LEVELS == 2
+    if ( page->shadow2_flags & SH2F_L1_32 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 2, 2)
+            (v, gmfn, entry, size);
+#else 
+    if ( page->shadow2_flags & SH2F_L1_32 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 2)
+            (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS == 2
+    if ( page->shadow2_flags & SH2F_L2_32 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 2, 2)
+            (v, gmfn, entry, size);
+#else 
+    if ( page->shadow2_flags & SH2F_L2_32 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 2)
+            (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3 
+    if ( page->shadow2_flags & SH2F_L1_PAE ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 3)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L2_PAE ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 3)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L2H_PAE ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, 3, 3)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L3_PAE ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 3, 3)
+            (v, gmfn, entry, size);
+#else /* 32-bit non-PAE hypervisor does not support PAE guests */
+    ASSERT((page->shadow2_flags & (SH2F_L3_PAE|SH2F_L2_PAE|SH2F_L1_PAE)) == 0);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4 
+    if ( page->shadow2_flags & SH2F_L1_64 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 4, 4)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L2_64 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 4, 4)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L3_64 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 4, 4)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L4_64 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, 4, 4)
+            (v, gmfn, entry, size);
+#else /* 32-bit/PAE hypervisor does not support 64-bit guests */
+    ASSERT((page->shadow2_flags 
+            & (SH2F_L4_64|SH2F_L3_64|SH2F_L2_64|SH2F_L1_64)) == 0);
+#endif
+
+    return result;
+}
+
+
+int
+shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
+/* This is the entry point from hypercalls. It returns a bitmask of all the 
+ * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
+{
+    int rc;
+
+    /* Hypercall writes are always one guest pagetable entry wide. */
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+    rc = __shadow2_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
+    shadow2_audit_tables(v);
+    return rc;
+}
+
+void
+shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
+                                void *entry, u32 size)
+/* This is the entry point for emulated writes to pagetables in HVM guests */
+{
+    struct domain *d = v->domain;
+    int rc;
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+    rc = __shadow2_validate_guest_entry(v, gmfn, entry, size);
+    if ( rc & SHADOW2_SET_FLUSH )
+    {
+        // Flush everyone except the local processor, which will flush when it
+        // re-enters the HVM guest.
+        //
+        cpumask_t mask = d->domain_dirty_cpumask;
+        cpu_clear(v->processor, mask);
+        flush_tlb_mask(mask);
+    }
+    if ( rc & SHADOW2_SET_ERROR ) 
+    {
+        /* This page is probably not a pagetable any more: tear it out of the 
+         * shadows, along with any tables that reference it */
+        shadow2_remove_all_shadows_and_parents(v, gmfn);
+    }
+    /* We ignore the other bits: since we are about to change CR3 on
+     * VMENTER we don't need to do any extra TLB flushes. */ 
+}
+
+
+/**************************************************************************/
+/* Memory management for shadow pages. */ 
+
+/* Meaning of the count_info field in shadow pages
+ * ----------------------------------------------
+ * 
+ * A count of all references to this page from other shadow pages and
+ * guest CR3s (a.k.a. v->arch.shadow_table).  
+ *
+ * The top bits hold the shadow type and the pinned bit.  Top-level
+ * shadows are pinned so that they don't disappear when not in a CR3
+ * somewhere.
+ *
+ * We don't need to use get|put_page for this as the updates are all
+ * protected by the shadow lock.  We can't use get|put_page for this
+ * as the size of the count on shadow pages is different from that on
+ * normal guest pages.
+ */
+
+/* Meaning of the type_info field in shadow pages
+ * ----------------------------------------------
+ * 
+ * type_info use depends on the shadow type (from count_info)
+ * 
+ * PGC_SH2_none : This page is in the shadow2 free pool.  type_info holds
+ *                the chunk order for our freelist allocator.
+ *
+ * PGC_SH2_l*_shadow : This page is in use as a shadow. type_info 
+ *                     holds the mfn of the guest page being shadowed,
+ *
+ * PGC_SH2_fl1_*_shadow : This page is being used to shatter a superpage.
+ *                        type_info holds the gfn being shattered.
+ *
+ * PGC_SH2_monitor_table : This page is part of a monitor table.
+ *                         type_info is not used.
+ */
+
+/* Meaning of the _domain field in shadow pages
+ * --------------------------------------------
+ *
+ * In shadow pages, this field will always have its least significant bit
+ * set.  This ensures that all attempts to get_page() will fail (as all
+ * valid pickled domain pointers have a zero for their least significant bit).
+ * Instead, the remaining upper bits are used to record the shadow generation
+ * counter when the shadow was created.
+ */
+
+/* Meaning of the shadow2_flags field
+ * ----------------------------------
+ * 
+ * In guest pages that are shadowed, one bit for each kind of shadow they have.
+ * 
+ * In shadow pages, will be used for holding a representation of the populated
+ * entries in this shadow (either a min/max, or a bitmap, or ...)
+ *
+ * In monitor-table pages, holds the level of the particular page (to save
+ * spilling the shadow types into an extra bit by having three types of monitor
+ * page).
+ */
+
+/* Meaning of the list_head struct in shadow pages
+ * -----------------------------------------------
+ *
+ * In free shadow pages, this is used to hold the free-lists of chunks.
+ *
+ * In top-level shadow tables, this holds a linked-list of all top-level
+ * shadows (used for recovering memory and destroying shadows). 
+ *
+ * In lower-level shadows, this holds the physical address of a higher-level
+ * shadow entry that holds a reference to this shadow (or zero).
+ */
+
+/* Allocating shadow pages
+ * -----------------------
+ *
+ * Most shadow pages are allocated singly, but there are two cases where we 
+ * need to allocate multiple pages together.
+ * 
+ * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
+ *    A 32-bit guest l1 table covers 4MB of virtual address space,
+ *    and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
+ *    of virtual address space each).  Similarly, a 32-bit guest l2 table 
+ *    (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va 
+ *    each).  These multi-page shadows are contiguous and aligned; 
+ *    functions for handling offsets into them are defined in shadow2.c 
+ *    (shadow_l1_index() etc.)
+ *    
+ * 2: Shadowing PAE top-level pages.  Each guest page that contains
+ *    any PAE top-level pages requires two shadow pages to shadow it.
+ *    They contain alternating l3 tables and pae_l3_bookkeeping structs.
+ *
+ * This table shows the allocation behaviour of the different modes:
+ *
+ * Xen paging      32b  pae  pae  64b  64b  64b
+ * Guest paging    32b  32b  pae  32b  pae  64b
+ * PV or HVM        *   HVM   *   HVM  HVM   * 
+ * Shadow paging   32b  pae  pae  pae  pae  64b
+ *
+ * sl1 size         4k   8k   4k   8k   4k   4k
+ * sl2 size         4k  16k   4k  16k   4k   4k
+ * sl3 size         -    -    8k   -    8k   4k
+ * sl4 size         -    -    -    -    -    4k
+ *
+ * We allocate memory from xen in four-page units and break them down
+ * with a simple buddy allocator.  Can't use the xen allocator to handle
+ * this as it only works for contiguous zones, and a domain's shadow
+ * pool is made of fragments.
+ *
+ * In HVM guests, the p2m table is built out of shadow pages, and we provide 
+ * a function for the p2m management to steal pages, in max-order chunks, from 
+ * the free pool.  We don't provide for giving them back, yet.
+ */
+
+/* Figure out the least acceptable quantity of shadow memory.
+ * The minimum memory requirement for always being able to free up a
+ * chunk of memory is very small -- only three max-order chunks per
+ * vcpu to hold the top level shadows and pages with Xen mappings in them.  
+ *
+ * But for a guest to be guaranteed to successfully execute a single
+ * instruction, we must be able to map a large number (about thirty) VAs
+ * at the same time, which means that to guarantee progress, we must
+ * allow for more than ninety allocated pages per vcpu.  We round that
+ * up to 128 pages, or half a megabyte per vcpu. */
+unsigned int shadow2_min_acceptable_pages(struct domain *d) 
+{
+    u32 vcpu_count = 0;
+    struct vcpu *v;
+
+    /* Count the domain's vcpus; 128 pages of headroom for each. */
+    for_each_vcpu(d, v)
+        vcpu_count++;
+
+    return (vcpu_count * 128);
+}
+
+/* Using the type_info field to store freelist order */
+#define SH2_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
+#define SH2_SET_PFN_ORDER(_p, _o)                       \
+ do { (_p)->u.inuse.type_info = (_o); } while (0)
+
+/* Figure out the order of allocation needed for a given shadow type */
+static inline u32
+shadow_order(u32 shadow_type) 
+{
+#if CONFIG_PAGING_LEVELS > 2
+    /* Table indexed by (shadow_type >> PGC_SH2_type_shift); entries must
+     * stay in the same order as the PGC_SH2_* type definitions. */
+    static const u32 type_to_order[16] = {
+        0, /* PGC_SH2_none           */
+        1, /* PGC_SH2_l1_32_shadow   */
+        1, /* PGC_SH2_fl1_32_shadow  */
+        2, /* PGC_SH2_l2_32_shadow   */
+        0, /* PGC_SH2_l1_pae_shadow  */
+        0, /* PGC_SH2_fl1_pae_shadow */
+        0, /* PGC_SH2_l2_pae_shadow  */
+        0, /* PGC_SH2_l2h_pae_shadow */
+        1, /* PGC_SH2_l3_pae_shadow  */
+        0, /* PGC_SH2_l1_64_shadow   */
+        0, /* PGC_SH2_fl1_64_shadow  */
+        0, /* PGC_SH2_l2_64_shadow   */
+        0, /* PGC_SH2_l3_64_shadow   */
+        0, /* PGC_SH2_l4_64_shadow   */
+        2, /* PGC_SH2_p2m_table      */
+        0  /* PGC_SH2_monitor_table  */
+        };
+    u32 type = (shadow_type & PGC_SH2_type_mask) >> PGC_SH2_type_shift;
+    return type_to_order[type];
+#else  /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
+    return 0;
+#endif
+}
+
+
<imports>
</imports>
+/* Do we have a free chunk of at least this order?
+ * (A larger chunk will do: shadow2_alloc() can split it.) */
+static inline int chunk_is_available(struct domain *d, int order)
+{
+    int i;
+    
+    for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
+        if ( !list_empty(&d->arch.shadow2_freelists[i]) )
+            return 1;
+    return 0;
+}
+
+/* Dispatcher function: call the per-mode function that will unhook the
+ * non-Xen mappings in this top-level shadow mfn.  The shadow's type
+ * (read from count_info) selects which per-mode implementation to use. */
+void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn)
+{
+    struct page_info *pg = mfn_to_page(smfn);
+    switch ( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift )
+    {
+    case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,2,2)(v,smfn);
+#else
+        SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,3,2)(v,smfn);
+#endif
+        break;
+#if CONFIG_PAGING_LEVELS >= 3
+    case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings,3,3)(v,smfn);
+        break;
+#endif
+#if CONFIG_PAGING_LEVELS >= 4
+    case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings,4,4)(v,smfn);
+        break;
+#endif
+    default:
+        /* Only top-level shadow types are legal here. */
+        SHADOW2_PRINTK("top-level shadow has bad type %08lx\n", 
+                       (unsigned long)((pg->count_info & PGC_SH2_type_mask)
+                                       >> PGC_SH2_type_shift));
+        BUG();
+    }
+}
+
+
+/* Make sure there is at least one chunk of the required order available
+ * in the shadow page pool. This must be called before any calls to
+ * shadow2_alloc().  Since this will free existing shadows to make room,
+ * it must be called early enough to avoid freeing shadows that the
+ * caller is currently working on. */
+void shadow2_prealloc(struct domain *d, unsigned int order)
+{
+    /* Need a vcpu for calling unpins; for now, since we don't have
+     * per-vcpu shadows, any will do */
+    struct vcpu *v = d->vcpu[0];
+    struct list_head *l, *t;
+    struct page_info *pg;
+    mfn_t smfn;
+
+    if ( chunk_is_available(d, order) ) return; 
+    
+    /* Stage one: walk the list of top-level pages, unpinning them */
+    perfc_incrc(shadow2_prealloc_1);
+    list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+    {
+        pg = list_entry(l, struct page_info, list);
+        smfn = page_to_mfn(pg);
+
+#if CONFIG_PAGING_LEVELS >= 3
+        if ( (pg->count_info & PGC_SH2_type_mask) == PGC_SH2_l3_pae_shadow )
+        {
+            /* For PAE, we need to unpin each subshadow on this shadow */
+            SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn);
+        } 
+        else 
+#endif /* 32-bit code always takes this branch */
+        {
+            /* Unpin this top-level shadow */
+            sh2_unpin(v, smfn);
+        }
+
+        /* See if that freed up a chunk of appropriate size */
+        if ( chunk_is_available(d, order) ) return;
+    }
+
+    /* Stage two: all shadow pages are in use in hierarchies that are
+     * loaded in cr3 on some vcpu.  Walk them, unhooking the non-Xen
+     * mappings. */
+    perfc_incrc(shadow2_prealloc_2);
+    /* Prefer the current vcpu if it belongs to this domain, so that the
+     * TLB-flush check below can compare against its shadow_table. */
+    v = current;
+    if ( v->domain != d )
+        v = d->vcpu[0];
+    /* Walk the list from the tail: recently used toplevels have been pulled
+     * to the head */
+    list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+    {
+        pg = list_entry(l, struct page_info, list);
+        smfn = page_to_mfn(pg);
+        shadow2_unhook_mappings(v, smfn);
+
+        /* Need to flush TLB if we've altered our own tables */
+        if ( !shadow2_mode_external(d) 
+             && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
+            local_flush_tlb();
+        
+        /* See if that freed up a chunk of appropriate size */
+        if ( chunk_is_available(d, order) ) return;
+    }
+    
+    /* Nothing more we can do: all remaining shadows are of pages that
+     * hold Xen mappings for some vcpu.  This should never happen. */
+    SHADOW2_PRINTK("Can't pre-allocate %i shadow pages!\n"
+                   "  shadow pages total = %u, free = %u, p2m=%u\n",
+                   1 << order, 
+                   d->arch.shadow2_total_pages, 
+                   d->arch.shadow2_free_pages, 
+                   d->arch.shadow2_p2m_pages);
+    BUG();
+}
+
+
+/* Allocate another shadow's worth of (contiguous, aligned) pages,
+ * and fill in the type and backpointer fields of their page_infos. 
+ * Never fails to allocate: callers must have called shadow2_prealloc()
+ * first, so a suitable chunk is guaranteed to be on a freelist. */
+mfn_t shadow2_alloc(struct domain *d,  
+                    u32 shadow_type,
+                    unsigned long backpointer)
+{
+    struct page_info *pg = NULL;
+    unsigned int order = shadow_order(shadow_type);
+    cpumask_t mask;
+    void *p;
+    int i;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(order <= SHADOW2_MAX_ORDER);
+    ASSERT(shadow_type != PGC_SH2_none);
+    perfc_incrc(shadow2_alloc);
+
+    /* Find smallest order which can satisfy the request. */
+    for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
+        if ( !list_empty(&d->arch.shadow2_freelists[i]) )
+        {
+            pg = list_entry(d->arch.shadow2_freelists[i].next, 
+                            struct page_info, list);
+            list_del(&pg->list);
+            
+            /* We may have to halve the chunk a number of times.
+             * Each split returns the upper buddy to the next-lower
+             * freelist and keeps the lower half for further splitting. */
+            while ( i != order )
+            {
+                i--;
+                SH2_SET_PFN_ORDER(pg, i);
+                list_add_tail(&pg->list, &d->arch.shadow2_freelists[i]);
+                pg += 1 << i;
+            }
+            d->arch.shadow2_free_pages -= 1 << order;
+
+            /* Init page info fields and clear the pages */
+            for ( i = 0; i < 1<<order ; i++ ) 
+            {
+                pg[i].u.inuse.type_info = backpointer;
+                pg[i].count_info = shadow_type;
+                pg[i].shadow2_flags = 0;
+                INIT_LIST_HEAD(&pg[i].list);
+                /* Before we overwrite the old contents of this page, 
+                 * we need to be sure that no TLB holds a pointer to it. */
+                mask = d->domain_dirty_cpumask;
+                tlbflush_filter(mask, pg[i].tlbflush_timestamp);
+                if ( unlikely(!cpus_empty(mask)) )
+                {
+                    perfc_incrc(shadow2_alloc_tlbflush);
+                    flush_tlb_mask(mask);
+                }
+                /* Now safe to clear the page for reuse */
+                p = sh2_map_domain_page(page_to_mfn(pg+i));
+                ASSERT(p != NULL);
+                clear_page(p);
+                sh2_unmap_domain_page(p);
+                perfc_incr(shadow2_alloc_count);
+            }
+            return page_to_mfn(pg);
+        }
+    
+    /* If we get here, we failed to allocate. This should never happen.
+     * It means that we didn't call shadow2_prealloc() correctly before
+     * we allocated.  We can't recover by calling prealloc here, because
+     * we might free up higher-level pages that the caller is working on. */
+    SHADOW2_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
+    BUG();
+}
+
+
+/* Return some shadow pages to the pool.
+ * Strips the shadow type from each page, records the TLB timestamp for
+ * deferred flushing, then merges the chunk with free buddies of equal
+ * order (standard buddy coalescing) before putting it on a freelist. */
+void shadow2_free(struct domain *d, mfn_t smfn)
+{
+    struct page_info *pg = mfn_to_page(smfn); 
+    u32 shadow_type;
+    unsigned long order;
+    unsigned long mask;
+    int i;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    perfc_incrc(shadow2_free);
+
+    shadow_type = pg->count_info & PGC_SH2_type_mask;
+    ASSERT(shadow_type != PGC_SH2_none);
+    ASSERT(shadow_type != PGC_SH2_p2m_table);
+    order = shadow_order(shadow_type);
+
+    d->arch.shadow2_free_pages += 1 << order;
+
+    for ( i = 0; i < 1<<order; i++ ) 
+    {
+        /* Strip out the type: this is now a free shadow page */
+        pg[i].count_info = 0;
+        /* Remember the TLB timestamp so we will know whether to flush 
+         * TLBs when we reuse the page.  Because the destructors leave the
+         * contents of the pages in place, we can delay TLB flushes until
+         * just before the allocator hands the page out again. */
+        pg[i].tlbflush_timestamp = tlbflush_current_time();
+        perfc_decr(shadow2_alloc_count);
+    }
+
+    /* Merge chunks as far as possible.  A buddy is mergeable iff it is
+     * free (shadow type PGC_SH2_none -- note this is a count_info field,
+     * so the PGT_* constants do not apply here) and of the same order. */
+    while ( order < SHADOW2_MAX_ORDER )
+    {
+        mask = 1 << order;
+        if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
+            /* Merge with predecessor block? */
+            if ( (((pg-mask)->count_info & PGC_SH2_type_mask) != PGC_SH2_none) 
+                 || (SH2_PFN_ORDER(pg-mask) != order) )
+                break;
+            list_del(&(pg-mask)->list);
+            pg -= mask;
+        } else {
+            /* Merge with successor block? */
+            if ( (((pg+mask)->count_info & PGC_SH2_type_mask) != PGC_SH2_none)
+                 || (SH2_PFN_ORDER(pg+mask) != order) )
+                break;
+            list_del(&(pg+mask)->list);
+        }
+        order++;
+    }
+
+    SH2_SET_PFN_ORDER(pg, order);
+    list_add_tail(&pg->list, &d->arch.shadow2_freelists[order]);
+}
+
+/* Divert some memory from the pool to be used by the p2m mapping.
+ * This action is irreversible: the p2m mapping only ever grows.
+ * That's OK because the p2m table only exists for external domains,
+ * and those domains can't ever turn off shadow mode.
+ * Also, we only ever allocate a max-order chunk, so as to preserve
+ * the invariant that shadow2_prealloc() always works.
+ * Returns 0 iff it can't get a chunk (the caller should then
+ * free up some pages in domheap and call set_sh2_allocation);
+ * returns non-zero on success.
+ */
+static int
+shadow2_alloc_p2m_pages(struct domain *d)
+{
+    struct page_info *pg;
+    u32 i;
+    ASSERT(shadow2_lock_is_acquired(d));
+    
+    /* Refuse to shrink the shadow pool below its working minimum. */
+    if ( d->arch.shadow2_total_pages 
+         < (shadow2_min_acceptable_pages(d) + (1<<SHADOW2_MAX_ORDER)) )
+        return 0; /* Not enough shadow memory: need to increase it first */
+    
+    pg = mfn_to_page(shadow2_alloc(d, PGC_SH2_p2m_table, 0));
+    d->arch.shadow2_p2m_pages += (1<<SHADOW2_MAX_ORDER);
+    d->arch.shadow2_total_pages -= (1<<SHADOW2_MAX_ORDER);
+    for (i = 0; i < (1<<SHADOW2_MAX_ORDER); i++)
+    {
+        /* Unlike shadow pages, mark p2m pages as owned by the domain */
+        page_set_owner(&pg[i], d);
+        list_add_tail(&pg[i].list, &d->arch.shadow2_p2m_freelist);
+    }
+    return 1;
+}
+
+// Take one zeroed page from the p2m freelist (refilling it if needed),
+// move it to the in-use list and give it one shadow ref.
+// Returns _mfn(0) if no memory is available...
+mfn_t
+shadow2_alloc_p2m_page(struct domain *d)
+{
+    struct list_head *entry;
+    mfn_t mfn;
+    void *p;
+
+    if ( list_empty(&d->arch.shadow2_p2m_freelist) &&
+         !shadow2_alloc_p2m_pages(d) )
+        return _mfn(0);
+    entry = d->arch.shadow2_p2m_freelist.next;
+    list_del(entry);
+    list_add_tail(entry, &d->arch.shadow2_p2m_inuse);
+    mfn = page_to_mfn(list_entry(entry, struct page_info, list));
+    sh2_get_ref(mfn, 0);
+    p = sh2_map_domain_page(mfn);
+    clear_page(p);
+    sh2_unmap_domain_page(p);
+
+    return mfn;
+}
+
+#if CONFIG_PAGING_LEVELS == 3
+static void p2m_install_entry_in_monitors(struct domain *d, 
+                                          l3_pgentry_t *l3e) 
+/* Special case, only used for external-mode domains on PAE hosts:
+ * update the mapping of the p2m table.  Once again, this is trivial in
+ * other paging modes (one top-level entry points to the top-level p2m,
+ * no maintenance needed), but PAE makes life difficult by needing a
+ * copy the eight l3es of the p2m table in eight l2h slots in the
+ * monitor table.  This function makes fresh copies when a p2m l3e
+ * changes. */
+{
+    l2_pgentry_t *ml2e;
+    struct vcpu *v;
+    unsigned int index;
+
+    /* Which of the p2m's l3 entries changed?  (l3e points into the
+     * p2m l3 page, so its page offset gives the slot index.) */
+    index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
+    ASSERT(index < MACHPHYS_MBYTES>>1);
+
+    for_each_vcpu(d, v) 
+    {
+        if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) 
+            continue;
+        ASSERT(shadow2_mode_external(v->domain));
+
+        SHADOW2_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
+                      d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
+
+        if ( v == current ) /* OK to use linear map of monitor_table */
+            ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
+        else 
+        {
+            /* Other vcpus' monitor tables must be mapped explicitly. */
+            l3_pgentry_t *ml3e;
+            ml3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+            ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
+            ml2e = sh2_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
+            ml2e += l2_table_offset(RO_MPT_VIRT_START);
+            sh2_unmap_domain_page(ml3e);
+        }
+        ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
+        if ( v != current )
+            sh2_unmap_domain_page(ml2e);
+    }
+}
+#endif
+
+// Find the next level's P2M entry, checking for out-of-range gfn's...
+// On success, masks the consumed bits out of *gfn_remainder and returns
+// a pointer to the entry within the (already-mapped) table page.
+// Returns NULL on error.
+//
+static l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+                   unsigned long gfn, u32 shift, u32 max)
+{
+    u32 index;
+
+    index = *gfn_remainder >> shift;
+    if ( index >= max )
+    {
+        SHADOW2_DEBUG(P2M, "gfn=0x%lx out of range "
+                      "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
+                       gfn, *gfn_remainder, shift, index, max);
+        return NULL;
+    }
+    *gfn_remainder &= (1 << shift) - 1;
+    return (l1_pgentry_t *)table + index;
+}
+
+// Walk one level of the P2M table, allocating a new table if required.
+// On success, replaces *table/*table_mfn with the next level's mapping
+// (unmapping the old one).  On failure, *table is left mapped for the
+// caller to unmap.  Returns 0 on error.
+//
+static int
+p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, 
+               unsigned long *gfn_remainder, unsigned long gfn, u32 shift, 
+               u32 max, unsigned long type)
+{
+    l1_pgentry_t *p2m_entry;
+    void *next;
+
+    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
+                                      shift, max)) )
+        return 0;
+
+    if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
+    {
+        /* Missing intermediate table: allocate and install one. */
+        mfn_t mfn = shadow2_alloc_p2m_page(d);
+        if ( mfn_x(mfn) == 0 )
+            return 0;
+        *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+        mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
+        mfn_to_page(mfn)->count_info = 1;
+#if CONFIG_PAGING_LEVELS == 3
+        if (type == PGT_l2_page_table)
+        {
+            /* We have written to the p2m l3: need to sync the per-vcpu
+             * copies of it in the monitor tables */
+            p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
+        }
+#endif
+        /* The P2M can be shadowed: keep the shadows synced */
+        if ( d->vcpu[0] )
+            (void)__shadow2_validate_guest_entry(d->vcpu[0], *table_mfn,
+                                                 p2m_entry, sizeof *p2m_entry);
+    }
+    *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
+    next = sh2_map_domain_page(*table_mfn);
+    sh2_unmap_domain_page(*table);
+    *table = next;
+
+    return 1;
+}
+
+// Insert (or clear, if mfn is invalid) the p2m mapping for gfn.
+// Walks/creates intermediate levels with p2m_next_level(), then writes
+// the leaf entry and revalidates any shadows of the p2m page.
+// Returns 0 on error (out of memory)
+int
+shadow2_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+    // XXX -- this might be able to be faster iff current->domain == d
+    mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
+    void *table = sh2_map_domain_page(table_mfn);
+    unsigned long gfn_remainder = gfn;
+    l1_pgentry_t *p2m_entry;
+    int rv = 0;
+
+    /* On the failure paths below, 'table' still holds the mapping of the
+     * last level we reached; it must be unmapped before returning, or we
+     * leak a domain-page mapping slot. */
+#if CONFIG_PAGING_LEVELS >= 4
+    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
+                         L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
+        goto out;
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    // When using PAE Xen, we only allow 33 bits of pseudo-physical
+    // address in translated guests (i.e. 8 GBytes).  This restriction
+    // comes from wanting to map the P2M table into the 16MB RO_MPT hole
+    // in Xen's address space for translated PV guests.
+    //
+    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                         L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+                         (CONFIG_PAGING_LEVELS == 3
+                          ? 8
+                          : L3_PAGETABLE_ENTRIES),
+                         PGT_l2_page_table) )
+        goto out;
+#endif
+    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                         L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                         L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+        goto out;
+
+    p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                               0, L1_PAGETABLE_ENTRIES);
+    ASSERT(p2m_entry);
+    if ( valid_mfn(mfn) )
+        *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+    else
+        *p2m_entry = l1e_empty();
+
+    /* The P2M can be shadowed: keep the shadows synced */
+    (void) __shadow2_validate_guest_entry(d->vcpu[0], table_mfn, 
+                                          p2m_entry, sizeof *p2m_entry);
+
+    rv = 1;
+
+ out:
+    sh2_unmap_domain_page(table);
+    return rv;
+}
+
+// Allocate a new p2m table for a domain.
+//
+// The structure of the p2m table is that of a pagetable for xen (i.e. it is
+// controlled by CONFIG_PAGING_LEVELS).
+//
+// Returns 0 if p2m table could not be initialized
+//
+static int
+shadow2_alloc_p2m_table(struct domain *d)
+{
+    mfn_t p2m_top;
+    struct list_head *entry;
+    unsigned int page_count = 0;
+    
+    SHADOW2_PRINTK("allocating p2m table\n");
+    ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
+
+    p2m_top = shadow2_alloc_p2m_page(d);
+    /* Check for allocation failure *before* touching the page_info --
+     * shadow2_alloc_p2m_page() returns _mfn(0) when out of memory. */
+    if ( mfn_x(p2m_top) == 0 )
+        return 0;
+
+    mfn_to_page(p2m_top)->count_info = 1;
+    mfn_to_page(p2m_top)->u.inuse.type_info = 
+#if CONFIG_PAGING_LEVELS == 4
+        PGT_l4_page_table
+#elif CONFIG_PAGING_LEVELS == 3
+        PGT_l3_page_table
+#elif CONFIG_PAGING_LEVELS == 2
+        PGT_l2_page_table
+#endif
+        | 1 | PGT_validated;
+
+    d->arch.phys_table = pagetable_from_mfn(p2m_top);
+
+    /* Populate the p2m with an entry for every page the domain owns
+     * (skipping pages whose m2p entry is uninitialised or invalid). */
+    SHADOW2_PRINTK("populating p2m table\n");
+    for ( entry = d->page_list.next;
+          entry != &d->page_list;
+          entry = entry->next )
+    {
+        struct page_info *page = list_entry(entry, struct page_info, list);
+        mfn_t mfn = page_to_mfn(page);
+        unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
+        page_count++;
+        if (
+#ifdef __x86_64__
+            (gfn != 0x5555555555555555L)
+#else
+            (gfn != 0x55555555L)
+#endif
+             && gfn != INVALID_M2P_ENTRY
+             && !shadow2_set_p2m_entry(d, gfn, mfn) )
+        {
+            SHADOW2_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH2_PRI_mfn "\n",
+                           gfn, mfn_x(mfn));
+            return 0;
+        }
+    }
+
+    SHADOW2_PRINTK("p2m table initialised (%u pages)\n", page_count);
+    return 1;
+}
+
+mfn_t
+sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
+/* Read another domain's p2m entries.  Walks the p2m pagetable one level
+ * at a time, mapping and unmapping each level; returns _mfn(INVALID_MFN)
+ * if any level is not present or gpfn is out of range. */
+{
+    mfn_t mfn;
+    unsigned long addr = gpfn << PAGE_SHIFT;
+    l2_pgentry_t *l2e;
+    l1_pgentry_t *l1e;
+    
+    ASSERT(shadow2_mode_translate(d));
+    mfn = pagetable_get_mfn(d->arch.phys_table);
+
+
+#if CONFIG_PAGING_LEVELS > 2
+    if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) 
+        /* This pfn is higher than the p2m map can hold */
+        return _mfn(INVALID_MFN);
+#endif
+
+
+#if CONFIG_PAGING_LEVELS >= 4
+    { 
+        l4_pgentry_t *l4e = sh2_map_domain_page(mfn);
+        l4e += l4_table_offset(addr);
+        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
+        {
+            sh2_unmap_domain_page(l4e);
+            return _mfn(INVALID_MFN);
+        }
+        mfn = _mfn(l4e_get_pfn(*l4e));
+        sh2_unmap_domain_page(l4e);
+    }
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    {
+        l3_pgentry_t *l3e = sh2_map_domain_page(mfn);
+        l3e += l3_table_offset(addr);
+        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+        {
+            sh2_unmap_domain_page(l3e);
+            return _mfn(INVALID_MFN);
+        }
+        mfn = _mfn(l3e_get_pfn(*l3e));
+        sh2_unmap_domain_page(l3e);
+    }
+#endif
+
+    l2e = sh2_map_domain_page(mfn);
+    l2e += l2_table_offset(addr);
+    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+    {
+        sh2_unmap_domain_page(l2e);
+        return _mfn(INVALID_MFN);
+    }
+    mfn = _mfn(l2e_get_pfn(*l2e));
+    sh2_unmap_domain_page(l2e);
+
+    l1e = sh2_map_domain_page(mfn);
+    l1e += l1_table_offset(addr);
+    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+    {
+        sh2_unmap_domain_page(l1e);
+        return _mfn(INVALID_MFN);
+    }
+    mfn = _mfn(l1e_get_pfn(*l1e));
+    sh2_unmap_domain_page(l1e);
+
+    return mfn;
+}
+
+/* Convenience wrapper: look up a gpfn in the *current* domain's p2m,
+ * returning the raw mfn value. */
+unsigned long
+shadow2_gfn_to_mfn_foreign(unsigned long gpfn)
+{
+    return mfn_x(sh2_gfn_to_mfn_foreign(current->domain, gpfn));
+}
+
+
+static void shadow2_p2m_teardown(struct domain *d)
+/* Return all the p2m pages to Xen.
+ * We know we don't have any extra mappings to these pages */
+{
+    struct list_head *entry, *n;
+    struct page_info *pg;
+
+    /* Drop the root pointer first so nothing can walk the table while
+     * we free it. */
+    d->arch.phys_table = pagetable_null();
+
+    list_for_each_safe(entry, n, &d->arch.shadow2_p2m_inuse)
+    {
+        pg = list_entry(entry, struct page_info, list);
+        list_del(entry);
+        /* Should have just the one ref we gave it in alloc_p2m_page() */
+        if ( (pg->count_info & PGC_SH2_count_mask) != 1 )
+        {
+            SHADOW2_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
+                           pg->count_info, pg->u.inuse.type_info);
+        }
+        ASSERT(page_get_owner(pg) == d);
+        /* Free should not decrement domain's total allocation, since 
+         * these pages were allocated without an owner. */
+        page_set_owner(pg, NULL); 
+        free_domheap_pages(pg, 0);
+        d->arch.shadow2_p2m_pages--;
+        perfc_decr(shadow2_alloc_count);
+    }
+    list_for_each_safe(entry, n, &d->arch.shadow2_p2m_freelist)
+    {
+        list_del(entry);
+        pg = list_entry(entry, struct page_info, list);
+        ASSERT(page_get_owner(pg) == d);
+        /* Free should not decrement domain's total allocation. */
+        page_set_owner(pg, NULL); 
+        free_domheap_pages(pg, 0);
+        d->arch.shadow2_p2m_pages--;
+        perfc_decr(shadow2_alloc_count);
+    }
+    ASSERT(d->arch.shadow2_p2m_pages == 0);
+}
+
+/* Set the pool of shadow pages to the required number of pages.
+ * Input will be rounded up to at least shadow2_min_acceptable_pages(),
+ * plus space for the p2m table.
+ * Returns 0 for success, non-zero for failure.
+ * May also return 0 early with *preempted set to 1 if a hypercall
+ * preemption check fires before the target is reached; the caller is
+ * expected to retry.
+ * NOTE(review): -ENOMEM is returned through an unsigned return type;
+ * callers only test for non-zero so this works, but the signed/unsigned
+ * mismatch is worth confirming/cleaning up. */
+static unsigned int set_sh2_allocation(struct domain *d, 
+                                       unsigned int pages,
+                                       int *preempted)
+{
+    struct page_info *pg;
+    unsigned int lower_bound;
+    int j;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    
+    /* Don't allocate less than the minimum acceptable, plus one page per
+     * megabyte of RAM (for the p2m table) */
+    lower_bound = shadow2_min_acceptable_pages(d) + (d->tot_pages / 256);
+    if ( pages > 0 && pages < lower_bound )
+        pages = lower_bound;
+    /* Round up to largest block size */
+    pages = (pages + ((1<<SHADOW2_MAX_ORDER)-1)) & ~((1<<SHADOW2_MAX_ORDER)-1);
+
+    SHADOW2_PRINTK("current %i target %i\n", 
+                   d->arch.shadow2_total_pages, pages);
+
+    /* Grow or shrink one SHADOW2_MAX_ORDER block at a time. */
+    while ( d->arch.shadow2_total_pages != pages ) 
+    {
+        if ( d->arch.shadow2_total_pages < pages ) 
+        {
+            /* Need to allocate more memory from domheap */
+            pg = alloc_domheap_pages(NULL, SHADOW2_MAX_ORDER, 0); 
+            if ( pg == NULL ) 
+            { 
+                SHADOW2_PRINTK("failed to allocate shadow pages.\n");
+                return -ENOMEM;
+            }
+            d->arch.shadow2_free_pages += 1<<SHADOW2_MAX_ORDER;
+            d->arch.shadow2_total_pages += 1<<SHADOW2_MAX_ORDER;
+            /* Mark every page in the block free and TLB-clean. */
+            for ( j = 0; j < 1<<SHADOW2_MAX_ORDER; j++ ) 
+            {
+                pg[j].u.inuse.type_info = 0;  /* Free page */
+                pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
+            }
+            SH2_SET_PFN_ORDER(pg, SHADOW2_MAX_ORDER);
+            list_add_tail(&pg->list, 
+                          &d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]);
+        } 
+        else if ( d->arch.shadow2_total_pages > pages ) 
+        {
+            /* Need to return memory to domheap */
+            shadow2_prealloc(d, SHADOW2_MAX_ORDER);
+            ASSERT(!list_empty(&d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]));
+            pg = list_entry(d->arch.shadow2_freelists[SHADOW2_MAX_ORDER].next, 
+                            struct page_info, list);
+            list_del(&pg->list);
+            d->arch.shadow2_free_pages -= 1<<SHADOW2_MAX_ORDER;
+            d->arch.shadow2_total_pages -= 1<<SHADOW2_MAX_ORDER;
+            free_domheap_pages(pg, SHADOW2_MAX_ORDER);
+        }
+
+        /* Check to see if we need to yield and try again */
+        if ( preempted && hypercall_preempt_check() )
+        {
+            *preempted = 1;
+            return 0;
+        }
+    }
+
+    return 0;
+}
+
+unsigned int shadow2_set_allocation(struct domain *d, 
+                                    unsigned int megabytes,
+                                    int *preempted)
+/* Hypercall interface to set the shadow memory allocation.
+ * Takes the shadow lock, converts megabytes to pages and delegates to
+ * set_sh2_allocation(); returns its result (0 success / non-zero error,
+ * possibly 0 with *preempted set). */
+{
+    unsigned int rv;
+    shadow2_lock(d);
+    /* (20 - PAGE_SHIFT) is log2 of the number of pages per megabyte. */
+    rv = set_sh2_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted); 
+    SHADOW2_PRINTK("dom %u allocation now %u pages (%u MB)\n",
+                   d->domain_id,
+                   d->arch.shadow2_total_pages,
+                   shadow2_get_allocation(d));
+    shadow2_unlock(d);
+    return rv;
+}
+
+/**************************************************************************/
+/* Hash table for storing the guest->shadow mappings */
+
+/* Hash function that takes a gfn or mfn, plus another byte of type info */
+typedef u32 key_t;
+/* Sdbm-style hash over the bytes of n, seeded with the type byte t. */
+static inline key_t sh2_hash(unsigned long n, u8 t) 
+{
+    unsigned char *bytes = (unsigned char *)&n;
+    key_t hash = t;
+    int idx;
+
+    for ( idx = 0; idx < sizeof(n); idx++ )
+        hash = (u32)bytes[idx] + (hash << 6) + (hash << 16) - hash;
+
+    return hash;
+}
+
+#if SHADOW2_AUDIT & (SHADOW2_AUDIT_HASH|SHADOW2_AUDIT_HASH_FULL)
+
+/* Before we get to the mechanism, define a pair of audit functions
+ * that sanity-check the contents of the hash table. */
+static void sh2_hash_audit_bucket(struct domain *d, int bucket)
+/* Audit one bucket of the hash table.  BUG()s on any inconsistency
+ * between a hash entry and the shadow page it points at. */
+{
+    struct shadow2_hash_entry *e, *x;
+    struct page_info *pg;
+
+    /* Compile-time audit switch may still be off at run time. */
+    if ( !(SHADOW2_AUDIT_ENABLE) )
+        return;
+
+    e = &d->arch.shadow2_hash_table[bucket];
+    if ( e->t == 0 ) return; /* Bucket is empty */ 
+    while ( e )
+    {
+        /* Empty link? */
+        BUG_ON( e->t == 0 ); 
+        /* Bogus type? */
+        BUG_ON( e->t > (PGC_SH2_max_shadow >> PGC_SH2_type_shift) );
+        /* Wrong bucket? */
+        BUG_ON( sh2_hash(e->n, e->t) % SHADOW2_HASH_BUCKETS != bucket ); 
+        /* Duplicate entry? */
+        for ( x = e->next; x; x = x->next )
+            BUG_ON( x->n == e->n && x->t == e->t );
+        /* Bogus MFN? */
+        BUG_ON( !valid_mfn(e->smfn) );
+        pg = mfn_to_page(e->smfn);
+        /* Not a shadow? (shadow pages have no owner) */
+        BUG_ON( page_get_owner(pg) != 0 );
+        /* Wrong kind of shadow? */
+        BUG_ON( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift 
+                != e->t ); 
+        /* Bad backlink? */
+        BUG_ON( pg->u.inuse.type_info != e->n );
+        /* FL1 shadows and monitor tables have no backing guest page, so
+         * only check guest shadow2_flags for the other types. */
+        if ( e->t != (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+             && e->t != (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+             && e->t != (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) )
+        {
+            /* Bad shadow flags on guest page? */
+            BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow2_flags & (1<<e->t)) );
+        }
+        /* That entry was OK; on we go */
+        e = e->next;
+    }
+}
+
+#else
+#define sh2_hash_audit_bucket(_d, _b)
+#endif /* Hashtable bucket audit */
+
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_HASH_FULL
+
+/* Full audit: sanity-check every bucket in the hash table in turn. */
+static void sh2_hash_audit(struct domain *d)
+{
+    int bucket;
+
+    if ( !(SHADOW2_AUDIT_ENABLE) )
+        return;
+
+    for ( bucket = 0; bucket < SHADOW2_HASH_BUCKETS; bucket++ )
+        sh2_hash_audit_bucket(d, bucket);
+}
+
+#else
+#define sh2_hash_audit(_d)
+#endif /* Hashtable full audit */
+
+/* Memory management interface for bucket allocation.
+ * These ought to come out of shadow memory, but at least on 32-bit
+ * machines we are forced to allocate them from xenheap so that we can
+ * address them. */
+static struct shadow2_hash_entry *sh2_alloc_hash_entry(struct domain *d)
+/* Pop one entry off the per-domain freelist, refilling it first if
+ * empty.  Crashes the domain (does not return NULL) on OOM. */
+{
+    struct shadow2_hash_entry *extra, *x;
+    int i;
+
+    /* We need to allocate a new node. Ensure the free list is not empty. 
+     * Allocate new entries in units the same size as the original table. */
+    if ( unlikely(d->arch.shadow2_hash_freelist == NULL) )
+    {
+        /* One extra pointer-sized slot past the entries holds the chain
+         * link used to free whole blocks in shadow2_hash_teardown(). */
+        size_t sz = sizeof(void *) + (SHADOW2_HASH_BUCKETS * sizeof(*x));
+        extra = xmalloc_bytes(sz);
+
+        if ( extra == NULL )
+        {
+            /* No memory left! */
+            SHADOW2_ERROR("xmalloc() failed when allocating hash buckets.\n");
+            domain_crash_synchronous();
+        }
+        memset(extra, 0, sz);
+
+        /* Record the allocation block so it can be correctly freed later. */
+        *((struct shadow2_hash_entry **)&extra[SHADOW2_HASH_BUCKETS]) = 
+            d->arch.shadow2_hash_allocations;
+        d->arch.shadow2_hash_allocations = &extra[0];
+
+        /* Thread a free chain through the newly-allocated nodes. */
+        for ( i = 0; i < (SHADOW2_HASH_BUCKETS - 1); i++ )
+            extra[i].next = &extra[i+1];
+        extra[i].next = NULL;
+
+        /* Add the new nodes to the free list. */
+        d->arch.shadow2_hash_freelist = &extra[0];
+    }
+
+    /* Allocate a new node from the free list. */
+    x = d->arch.shadow2_hash_freelist;
+    d->arch.shadow2_hash_freelist = x->next;
+    return x;
+}
+
+/* Push a no-longer-needed hash entry back on the per-domain freelist. */
+static void sh2_free_hash_entry(struct domain *d, struct shadow2_hash_entry *e)
+{
+    e->next = d->arch.shadow2_hash_freelist;
+    e->t = 0;  /* t == 0 marks the entry as empty */
+    d->arch.shadow2_hash_freelist = e;
+}
+
+
+/* Allocate and zero the hash table itself.
+ * Returns 0 for success, 1 for error. */
+static int shadow2_hash_alloc(struct domain *d)
+{
+    struct shadow2_hash_entry *buckets;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(!d->arch.shadow2_hash_table);
+
+    buckets = xmalloc_array(struct shadow2_hash_entry, SHADOW2_HASH_BUCKETS);
+    if ( buckets == NULL )
+        return 1;
+    memset(buckets, 0, SHADOW2_HASH_BUCKETS * sizeof(*buckets));
+    d->arch.shadow2_hash_table = buckets;
+    return 0;
+}
+
+/* Tear down the hash table and return all memory to Xen.
+ * This function does not care whether the table is populated. */
+static void shadow2_hash_teardown(struct domain *d)
+{
+    struct shadow2_hash_entry *a, *n;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_table);
+
+    /* Return the table itself */
+    xfree(d->arch.shadow2_hash_table);
+    d->arch.shadow2_hash_table = NULL;
+
+    /* Return any extra allocations (blocks created by
+     * sh2_alloc_hash_entry() when the freelist ran dry). */
+    a = d->arch.shadow2_hash_allocations;
+    while ( a ) 
+    {
+        /* We stored a linked-list pointer at the end of each allocation */
+        n = *((struct shadow2_hash_entry **)(&a[SHADOW2_HASH_BUCKETS]));
+        xfree(a);
+        a = n;
+    }
+    d->arch.shadow2_hash_allocations = NULL;
+    /* The freelist entries all lived inside the blocks just freed, so
+     * simply forget the list head. */
+    d->arch.shadow2_hash_freelist = NULL;
+}
+
+
+mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
+/* Find an entry in the hash table.  Returns the MFN of the shadow,
+ * or INVALID_MFN if it doesn't exist.
+ * On a hit away from the head of the chain the entry is moved to the
+ * front (unless someone is concurrently walking the chains). */
+{
+    struct domain *d = v->domain;
+    struct shadow2_hash_entry *p, *x, *head;
+    key_t key;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_table);
+    ASSERT(t);
+
+    sh2_hash_audit(d);
+
+    perfc_incrc(shadow2_hash_lookups);
+    key = sh2_hash(n, t);
+
+    x = head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+    p = NULL;
+
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+    do
+    {
+        /* Only the head entry of a bucket may be empty (t == 0), and
+         * then only when it has no successors. */
+        ASSERT(x->t || ((x == head) && (x->next == NULL)));
+
+        if ( x->n == n && x->t == t )
+        {
+            /* Pull-to-front if 'x' isn't already the head item */
+            if ( unlikely(x != head) )
+            {
+                if ( unlikely(d->arch.shadow2_hash_walking != 0) )
+                    /* Can't reorder: someone is walking the hash chains */
+                    return x->smfn;
+                else 
+                {
+                    /* Delete 'x' from list and reinsert after head. */
+                    p->next = x->next;
+                    x->next = head->next;
+                    head->next = x;
+                    
+                    /* Swap 'x' contents with head contents. */
+                    SWAP(head->n, x->n);
+                    SWAP(head->t, x->t);
+                    SWAP(head->smfn, x->smfn);
+                }
+            }
+            else
+            {
+                perfc_incrc(shadow2_hash_lookup_head);
+            }
+            return head->smfn;
+        }
+
+        p = x;
+        x = x->next;
+    }
+    while ( x != NULL );
+
+    perfc_incrc(shadow2_hash_lookup_miss);
+    return _mfn(INVALID_MFN);
+}
+
+void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Put a mapping (n,t)->smfn into the hash table.
+ * The caller must hold the shadow lock; (n,t) must not already be
+ * present (no duplicate check is performed here). */
+{
+    struct domain *d = v->domain;
+    struct shadow2_hash_entry *x, *head;
+    key_t key;
+    
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_table);
+    ASSERT(t);
+
+    sh2_hash_audit(d);
+
+    perfc_incrc(shadow2_hash_inserts);
+    key = sh2_hash(n, t);
+
+    head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+    /* If the bucket is empty then insert the new page as the head item. */
+    if ( head->t == 0 )
+    {
+        head->n = n;
+        head->t = t;
+        head->smfn = smfn;
+        ASSERT(head->next == NULL);
+    }
+    else 
+    {
+        /* Insert a new entry directly after the head item. */
+        x = sh2_alloc_hash_entry(d);
+        x->n = n; 
+        x->t = t;
+        x->smfn = smfn;
+        x->next = head->next;
+        head->next = x;
+    }
+    
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+}
+
+void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Excise the mapping (n,t)->smfn from the hash table.
+ * The entry must exist: the chain walk BUG()s (via ASSERT) if the end
+ * of the chain is reached without finding it. */
+{
+    struct domain *d = v->domain;
+    struct shadow2_hash_entry *p, *x, *head;
+    key_t key;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_table);
+    ASSERT(t);
+
+    sh2_hash_audit(d);
+
+    perfc_incrc(shadow2_hash_deletes);
+    key = sh2_hash(n, t);
+
+    head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+    /* Match on head item?  The head lives inside the table itself, so
+     * it is emptied or overwritten rather than freed. */
+    if ( head->n == n && head->t == t )
+    {
+        if ( (x = head->next) != NULL )
+        {
+            /* Overwrite head with contents of following node. */
+            head->n = x->n;
+            head->t = x->t;
+            head->smfn = x->smfn;
+
+            /* Delete following node. */
+            head->next = x->next;
+            sh2_free_hash_entry(d, x);
+        }
+        else
+        {
+            /* This bucket is now empty. Initialise the head node. */
+            head->t = 0;
+        }
+    }
+    else 
+    {
+        /* Not at the head; need to walk the chain */
+        p = head;
+        x = head->next; 
+        
+        while(1)
+        {
+            ASSERT(x); /* We can't have hit the end, since our target is
+                        * still in the chain somewhere... */
+            if ( x->n == n && x->t == t )
+            {
+                /* Delete matching node. */
+                p->next = x->next;
+                sh2_free_hash_entry(d, x);
+                break;
+            }
+            p = x;
+            x = x->next;
+        }
+    }
+
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+}
+
+/* Callback signature for hash_foreach(): invoked per matching shadow. */
+typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
+
+static void hash_foreach(struct vcpu *v, 
+                         unsigned int callback_mask, 
+                         hash_callback_t callbacks[], 
+                         mfn_t callback_mfn)
+/* Walk the hash table looking at the types of the entries and 
+ * calling the appropriate callback function for each entry. 
+ * The mask determines which shadow types we call back for, and the array
+ * of callbacks tells us which function to call.
+ * Any callback may return non-zero to let us skip the rest of the scan. 
+ *
+ * WARNING: Callbacks MUST NOT add or remove hash entries unless they 
+ * then return non-zero to terminate the scan. */
+{
+    int i, done = 0;
+    struct domain *d = v->domain;
+    struct shadow2_hash_entry *x;
+
+    /* Say we're here, to stop hash-lookups reordering the chains */
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_walking == 0);
+    d->arch.shadow2_hash_walking = 1;
+
+    callback_mask &= ~1; /* Never attempt to call back on empty buckets */
+    for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ ) 
+    {
+        /* WARNING: This is not safe against changes to the hash table.
+         * The callback *must* return non-zero if it has inserted or
+         * deleted anything from the hash (lookups are OK, though). */
+        for ( x = &d->arch.shadow2_hash_table[i]; x; x = x->next )
+        {
+            /* Mask is indexed by shadow type: bit (1 << x->t). */
+            if ( callback_mask & (1 << x->t) ) 
+            {
+                ASSERT(x->t <= 15);
+                ASSERT(callbacks[x->t] != NULL);
+                if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 )
+                    break;
+            }
+        }
+        if ( done ) break; 
+    }
+    /* Allow hash lookups to reorder chains again. */
+    d->arch.shadow2_hash_walking = 0; 
+}
+
+
+/**************************************************************************/
+/* Destroy a shadow page: simple dispatcher to call the per-type destructor
+ * which will decrement refcounts appropriately and return memory to the 
+ * free pool. */
+
+void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn)
+{
+    struct page_info *pg = mfn_to_page(smfn);
+    u32 t = pg->count_info & PGC_SH2_type_mask;
+
+
+    SHADOW2_PRINTK("smfn=%#lx\n", mfn_x(smfn));
+
+    /* Double-check, if we can, that the shadowed page belongs to this
+     * domain, (by following the back-pointer).  FL1 shadows and monitor
+     * tables have no backing guest page to check. */
+    ASSERT(t == PGC_SH2_fl1_32_shadow  ||  
+           t == PGC_SH2_fl1_pae_shadow ||  
+           t == PGC_SH2_fl1_64_shadow  || 
+           t == PGC_SH2_monitor_table  || 
+           (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info))) 
+            == v->domain)); 
+
+    /* The down-shifts here are so that the switch statement is on nice
+     * small numbers that the compiler will enjoy */
+    switch ( t >> PGC_SH2_type_shift )
+    {
+#if CONFIG_PAGING_LEVELS == 2
+    case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 2, 2)(v, smfn); 
+        break;
+    case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 2, 2)(v, smfn);
+        break;
+#else /* PAE or 64bit */
+    case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 2)(v, smfn);
+        break;
+    case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 2)(v, smfn);
+        break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+    case PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 3)(v, smfn);
+        break;
+    case PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 3)(v, smfn);
+        break;
+    case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 3, 3)(v, smfn);
+        break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+    case PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 4, 4)(v, smfn);
+        break;
+    case PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 4, 4)(v, smfn);
+        break;
+    case PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 4, 4)(v, smfn);
+        break;
+    case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, 4, 4)(v, smfn);
+        break;
+#endif
+    default:
+        SHADOW2_PRINTK("tried to destroy shadow of bad type %08lx\n", 
+                       (unsigned long)t);
+        BUG();
+    }    
+}
+
+/**************************************************************************/
+/* Remove all writeable mappings of a guest frame from the shadow tables 
+ * Returns non-zero if we need to flush TLBs. 
+ * level and fault_addr describe how we found this to be a pagetable;
+ * level==0 means we have some other reason for revoking write access.*/
+
+int shadow2_remove_write_access(struct vcpu *v, mfn_t gmfn, 
+                                unsigned int level,
+                                unsigned long fault_addr)
+/* Revoke all writeable mappings of gmfn from the shadows.
+ * level/fault_addr describe how gmfn was found to be a pagetable
+ * (level == 0 means some other reason); they feed the heuristic below.
+ * Returns non-zero if TLBs must be flushed. */
+{
+    /* Dispatch table for getting per-type functions; indexed by shadow
+     * type, so only L1/FL1 slots are populated. */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* fl1_32  */
+#else 
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* fl1_32  */
+#endif
+        NULL, /* l2_32   */
+#if CONFIG_PAGING_LEVELS >= 3
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* l1_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* fl1_pae */
+#else 
+        NULL, /* l1_pae  */
+        NULL, /* fl1_pae */
+#endif
+        NULL, /* l2_pae  */
+        NULL, /* l2h_pae */
+        NULL, /* l3_pae  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* l1_64   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* fl1_64  */
+#else
+        NULL, /* l1_64   */
+        NULL, /* fl1_64  */
+#endif
+        NULL, /* l2_64   */
+        NULL, /* l3_64   */
+        NULL, /* l4_64   */
+        NULL, /* p2m     */
+        NULL  /* unused  */
+    };
+
+    /* Only L1/FL1 shadows can contain writeable guest mappings. */
+    static unsigned int callback_mask = 
+          1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
+        ;
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+
+    /* Only remove writable mappings if we are doing shadow refcounts.
+     * In guest refcounting, we trust Xen to already be restricting
+     * all the writes to the guest page tables, so we do not need to
+     * do more. */
+    if ( !shadow2_mode_refcounts(v->domain) )
+        return 0;
+
+    /* Early exit if it's already a pagetable, or otherwise not writeable */
+    if ( sh2_mfn_is_a_page_table(gmfn) 
+         || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
+        return 0;
+
+    perfc_incrc(shadow2_writeable);
+
+    /* If this isn't a "normal" writeable page, the domain is trying to 
+     * put pagetables in special memory of some kind.  We can't allow that. */
+    if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
+    {
+        SHADOW2_ERROR("can't remove write access to mfn %lx, type_info is %" 
+                      PRtype_info "\n",
+                      mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
+        domain_crash(v->domain);
+    }
+
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
+    if ( v == current && level != 0 )
+    {
+        unsigned long gfn;
+        /* Heuristic: there is likely to be only one writeable mapping,
+         * and that mapping is likely to be in the current pagetable,
+         * either in the guest's linear map (linux, windows) or in a
+         * magic slot used to map high memory regions (linux HIGHTPTE) */
+
+        /* Try one guessed virtual address; return early once the page
+         * has no writeable mappings left. */
+#define GUESS(_a, _h) do {                                              \
+            if ( v->arch.shadow2->guess_wrmap(v, (_a), gmfn) )          \
+                perfc_incrc(shadow2_writeable_h_ ## _h);                \
+            if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )        \
+                return 1;                                               \
+        } while (0)
+
+        
+        /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */
+        /* NOTE(review): the inner 'v == current' test is redundant under
+         * the guard above -- confirm and simplify. */
+        if ( v == current 
+             && (gfn = sh2_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 )
+            GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4);
+
+        if ( v->arch.shadow2->guest_levels == 2 )
+        {
+            if ( level == 1 )
+                /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
+                GUESS(0xC0000000UL + (fault_addr >> 10), 1);
+        }
+#if CONFIG_PAGING_LEVELS >= 3
+        else if ( v->arch.shadow2->guest_levels == 3 )
+        {
+            /* 32bit PAE w2k3: linear map at 0xC0000000 */
+            switch ( level ) 
+            {
+            case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
+            case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
+            }
+        }
+#if CONFIG_PAGING_LEVELS >= 4
+        else if ( v->arch.shadow2->guest_levels == 4 )
+        {
+            /* 64bit w2k3: linear map at 0x0000070000000000 */
+            switch ( level ) 
+            {
+            case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
+            case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
+            case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
+            }
+        }
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS >= 3 */
+
+#undef GUESS
+
+    }
+#endif
+    
+    /* Brute-force search of all the shadows, by walking the hash */
+    perfc_incrc(shadow2_writeable_bf);
+    hash_foreach(v, callback_mask, callbacks, gmfn);
+
+    /* If that didn't catch the mapping, something is very wrong */
+    if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
+    {
+        SHADOW2_ERROR("can't find all writeable mappings of mfn %lx: "
+                      "%lu left\n", mfn_x(gmfn),
+                      (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
+        domain_crash(v->domain);
+    }
+    
+    /* We killed at least one writeable mapping, so must flush TLBs. */
+    return 1;
+}
+
+
+
+/**************************************************************************/
+/* Remove all mappings of a guest frame from the shadow tables.
+ * Returns non-zero if we need to flush TLBs. */
+
+int shadow2_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
+{
+    struct page_info *page = mfn_to_page(gmfn);
+    int expected_count;
+
+    /* Dispatch table for getting per-type functions; indexed by shadow
+     * type, so only L1/FL1 slots are populated. */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* fl1_32  */
+#else 
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* fl1_32  */
+#endif
+        NULL, /* l2_32   */
+#if CONFIG_PAGING_LEVELS >= 3
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* l1_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* fl1_pae */
+#else 
+        NULL, /* l1_pae  */
+        NULL, /* fl1_pae */
+#endif
+        NULL, /* l2_pae  */
+        NULL, /* l2h_pae */
+        NULL, /* l3_pae  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* l1_64   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* fl1_64  */
+#else
+        NULL, /* l1_64   */
+        NULL, /* fl1_64  */
+#endif
+        NULL, /* l2_64   */
+        NULL, /* l3_64   */
+        NULL, /* l4_64   */
+        NULL, /* p2m     */
+        NULL  /* unused  */
+    };
+
+    /* Only L1/FL1 shadows can contain guest mappings. */
+    static unsigned int callback_mask = 
+          1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
+        ;
+
+    perfc_incrc(shadow2_mappings);
+    if ( (page->count_info & PGC_count_mask) == 0 )
+        return 0;
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+
+    /* XXX TODO: 
+     * Heuristics for finding the (probably) single mapping of this gmfn */
+    
+    /* Brute-force search of all the shadows, by walking the hash */
+    perfc_incrc(shadow2_mappings_bf);
+    hash_foreach(v, callback_mask, callbacks, gmfn);
+
+    /* If that didn't catch the mapping, something is very wrong.
+     * An allocated page keeps exactly one reference (PGC_allocated). */
+    expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
+    if ( (page->count_info & PGC_count_mask) != expected_count )
+    {
+        /* Don't complain if we're in HVM and there's one extra mapping: 
+         * The qemu helper process has an untyped mapping of this dom's RAM */
+        if ( !(shadow2_mode_external(v->domain)
+               && (page->count_info & PGC_count_mask) <= 2
+               && (page->u.inuse.type_info & PGT_count_mask) == 0) )
+        {
+            SHADOW2_ERROR("can't find all mappings of mfn %lx: "
+                          "c=%08x t=%08lx\n", mfn_x(gmfn), 
+                          page->count_info, page->u.inuse.type_info);
+        }
+    }
+
+    /* We killed at least one mapping, so must flush TLBs. */
+    return 1;
+}
+
+
+/**************************************************************************/
+/* Remove all shadows of a guest frame from the shadow tables */
+
+static int sh2_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
+/* Follow this shadow's up-pointer, if it has one, and remove the reference
+ * found there.  Returns 1 if that was the only reference to this shadow.
+ * The up-pointer (pg->up) packs the parent shadow's mfn (upper bits,
+ * shifted by PAGE_SHIFT) with the byte offset of the entry within it. */
+{
+    struct page_info *pg = mfn_to_page(smfn);
+    mfn_t pmfn;
+    void *vaddr;
+    int rc;
+
+    /* Top-level shadow types have no parent and must not come here. */
+    ASSERT((pg->count_info & PGC_SH2_type_mask) > 0);
+    ASSERT((pg->count_info & PGC_SH2_type_mask) < PGC_SH2_max_shadow);
+    ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l2_32_shadow);
+    ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l3_pae_shadow);
+    ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l4_64_shadow);
+    
+    if (pg->up == 0) return 0;
+    pmfn = _mfn(pg->up >> PAGE_SHIFT);
+    ASSERT(valid_mfn(pmfn));
+    vaddr = sh2_map_domain_page(pmfn);
+    ASSERT(vaddr);
+    vaddr += pg->up & (PAGE_SIZE-1);
+    ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
+    
+    /* Is this the only reference to this shadow? */
+    rc = ((pg->count_info & PGC_SH2_count_mask) == 1) ? 1 : 0;
+
+    /* Blank the offending entry */
+    switch ((pg->count_info & PGC_SH2_type_mask)) 
+    {
+    case PGC_SH2_l1_32_shadow:
+    case PGC_SH2_l2_32_shadow:
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,2,2)(v, vaddr, pmfn);
+#else
+        SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,2)(v, vaddr, pmfn);
+#endif
+        break;
+#if CONFIG_PAGING_LEVELS >=3
+    case PGC_SH2_l1_pae_shadow:
+    case PGC_SH2_l2_pae_shadow:
+    case PGC_SH2_l2h_pae_shadow:
+    case PGC_SH2_l3_pae_shadow:
+        SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,3)(v, vaddr, pmfn);
+        break;
+#if CONFIG_PAGING_LEVELS >= 4
+    case PGC_SH2_l1_64_shadow:
+    case PGC_SH2_l2_64_shadow:
+    case PGC_SH2_l3_64_shadow:
+    case PGC_SH2_l4_64_shadow:
+        SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,4,4)(v, vaddr, pmfn);
+        break;
+#endif
+#endif
+    default: BUG(); /* Some weird unknown shadow type */
+    }
+    
+    sh2_unmap_domain_page(vaddr);
+    if ( rc )
+        perfc_incrc(shadow2_up_pointer);
+    else
+        perfc_incrc(shadow2_unshadow_bf);
+
+    return rc;
+}
+
+void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all)
+/* Remove the shadows of this guest page.  
+ * If all != 0, find all shadows, if necessary by walking the tables.
+ * Otherwise, just try the (much faster) heuristics, which will remove 
+ * at most one reference to each shadow of the page. */
+{
+    struct page_info *pg;
+    mfn_t smfn;
+    u32 sh_flags;
+    unsigned char t; /* shadow type index; set by the DO_* macros below */
+
+    /* Dispatch table for getting per-type functions: each level must
+     * be called with the function to remove a lower-level shadow. */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+        NULL, /* l1_32   */
+        NULL, /* fl1_32  */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,2,2), /* l2_32   */
+#else 
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,2), /* l2_32   */
+#endif
+        NULL, /* l1_pae  */
+        NULL, /* fl1_pae */
+#if CONFIG_PAGING_LEVELS >= 3
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2h_pae */
+        SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,3,3), /* l3_pae  */
+#else 
+        NULL, /* l2_pae  */
+        NULL, /* l2h_pae */
+        NULL, /* l3_pae  */
+#endif
+        NULL, /* l1_64   */
+        NULL, /* fl1_64  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,4,4), /* l2_64   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,4,4), /* l3_64   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow,4,4), /* l4_64   */
+#else
+        NULL, /* l2_64   */
+        NULL, /* l3_64   */
+        NULL, /* l4_64   */
+#endif
+        NULL, /* p2m     */
+        NULL  /* unused  */
+    };
+
+    /* Another lookup table, for choosing which mask to use: for each
+     * shadow type, the set of parent shadow types that may reference it
+     * (used to restrict the hash walk to relevant parents). */
+    static unsigned int masks[16] = {
+        0, /* none    */
+        1 << (PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift), /* l1_32   */
+        0, /* fl1_32  */
+        0, /* l2_32   */
+        ((1 << (PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift))
+         | (1 << (PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift))), /* l1_pae  */
+        0, /* fl1_pae */
+        1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2_pae  */
+        1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2h_pae  */
+        0, /* l3_pae  */
+        1 << (PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift), /* l1_64   */
+        0, /* fl1_64  */
+        1 << (PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift), /* l2_64   */
+        1 << (PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift), /* l3_64   */
+        0, /* l4_64   */
+        0, /* p2m     */
+        0  /* unused  */
+    };
+
+    SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
+                   v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+
+    pg = mfn_to_page(gmfn);
+
+    /* Bale out now if the page is not shadowed */
+    if ( (pg->count_info & PGC_page_table) == 0 )
+        return;
+
+    /* Search for this shadow in all appropriate shadows */
+    perfc_incrc(shadow2_unshadow);
+    sh_flags = pg->shadow2_flags;
+
+    /* Lower-level shadows need to be excised from upper-level shadows.
+     * This call to hash_foreach() looks dangerous but is in fact OK: each
+     * call will remove at most one shadow, and terminate immediately when
+     * it does remove it, so we never walk the hash after doing a deletion.  */
+#define DO_UNSHADOW(_type) do {                                 \
+    t = (_type) >> PGC_SH2_type_shift;                          \
+    smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t);              \
+    if ( !sh2_remove_shadow_via_pointer(v, smfn) && all )       \
+        hash_foreach(v, masks[t], callbacks, smfn);             \
+} while (0)
+
+    /* Top-level shadows need to be unpinned */
+#define DO_UNPIN(_type) do {                                             \
+    t = (_type) >> PGC_SH2_type_shift;                                   \
+    smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t);                       \
+    if ( mfn_to_page(smfn)->count_info & PGC_SH2_pinned )                \
+        sh2_unpin(v, smfn);                                              \
+    if ( (_type) == PGC_SH2_l3_pae_shadow )                              \
+        SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn); \
+} while (0)
+
+    if ( sh_flags & SH2F_L1_32 )   DO_UNSHADOW(PGC_SH2_l1_32_shadow);
+    if ( sh_flags & SH2F_L2_32 )   DO_UNPIN(PGC_SH2_l2_32_shadow);
+#if CONFIG_PAGING_LEVELS >= 3
+    if ( sh_flags & SH2F_L1_PAE )  DO_UNSHADOW(PGC_SH2_l1_pae_shadow);
+    if ( sh_flags & SH2F_L2_PAE )  DO_UNSHADOW(PGC_SH2_l2_pae_shadow);
+    if ( sh_flags & SH2F_L2H_PAE ) DO_UNSHADOW(PGC_SH2_l2h_pae_shadow);
+    if ( sh_flags & SH2F_L3_PAE )  DO_UNPIN(PGC_SH2_l3_pae_shadow);
+#if CONFIG_PAGING_LEVELS >= 4
+    if ( sh_flags & SH2F_L1_64 )   DO_UNSHADOW(PGC_SH2_l1_64_shadow);
+    if ( sh_flags & SH2F_L2_64 )   DO_UNSHADOW(PGC_SH2_l2_64_shadow);
+    if ( sh_flags & SH2F_L3_64 )   DO_UNSHADOW(PGC_SH2_l3_64_shadow);
+    if ( sh_flags & SH2F_L4_64 )   DO_UNPIN(PGC_SH2_l4_64_shadow);
+#endif
+#endif
+
+#undef DO_UNSHADOW
+#undef DO_UNPIN
+
+
+#if CONFIG_PAGING_LEVELS > 2
+    /* We may have caused some PAE l3 entries to change: need to 
+     * fix up the copies of them in various places */
+    if ( sh_flags & (SH2F_L2_PAE|SH2F_L2H_PAE) )
+        sh2_pae_recopy(v->domain);
+#endif
+
+    /* If that didn't catch the shadows, something is wrong */
+    if ( all && (pg->count_info & PGC_page_table) )
+    {
+        SHADOW2_ERROR("can't find all shadows of mfn %05lx (shadow2_flags=%08x)\n",
+                      mfn_x(gmfn), pg->shadow2_flags);
+        domain_crash(v->domain);
+    }
+}
+
+void
+shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
+/* Even harsher: this is a HVM page that we think is no longer a pagetable.
+ * Unshadow it, and recursively unshadow pages that reference it. */
+{
+    shadow2_remove_all_shadows(v, gmfn);
+    /* XXX TODO:
+     * Rework this hashtable walker to return a linked-list of all 
+     * the shadows it modified, then do breadth-first recursion 
+     * to find the way up to higher-level tables and unshadow them too. 
+     *
+     * The current code (just tearing down each page's shadows as we
+     * detect that it is not a pagetable) is correct, but very slow. 
+     * It means extra emulated writes and slows down removal of mappings. */
+}
+
+/**************************************************************************/
+
+void sh2_update_paging_modes(struct vcpu *v)
+/* Recalculate which shadow2 entry points this vcpu should be using,
+ * based on whether it is PV or HVM and (for HVM) its current paging
+ * state; rebuild the monitor table if the shadow level count changed. */
+{
+    struct domain *d = v->domain;
+    /* Remember the previous entry points so we can detect a mode change */
+    struct shadow2_entry_points *old_entries = v->arch.shadow2;
+    mfn_t old_guest_table;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+
+    // Valid transitions handled by this function:
+    // - For PV guests:
+    //     - after a shadow mode has been changed
+    // - For HVM guests:
+    //     - after a shadow mode has been changed
+    //     - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
+    //
+
+    // Avoid determining the current shadow2 mode for uninitialized CPUs, as
+    // we can not yet determine whether it is an HVM or PV domain.
+    //
+    if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+    {
+        printk("%s: postponing determination of shadow2 mode\n", __func__);
+        return;
+    }
+
+    // First, tear down any old shadow tables held by this vcpu.
+    //
+    if ( v->arch.shadow2 )
+        shadow2_detach_old_tables(v);
+
+    if ( !hvm_guest(v) )
+    {
+        ///
+        /// PV guest
+        ///
+#if CONFIG_PAGING_LEVELS == 4
+        if ( pv_32bit_guest(v) )
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 3);
+        else
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4);
+#elif CONFIG_PAGING_LEVELS == 3
+        v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3);
+#elif CONFIG_PAGING_LEVELS == 2
+        v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2);
+#else
+#error unexpected paging mode
+#endif
+    }
+    else
+    {
+        ///
+        /// HVM guest
+        ///
+        ASSERT(shadow2_mode_translate(d));
+        ASSERT(shadow2_mode_external(d));
+
+        if ( !hvm_paging_enabled(v) )
+        {
+            // paging disabled...
+            clear_bit(_VCPUF_shadow2_translate, &v->vcpu_flags);
+            
+            /* Set v->arch.guest_table to use the p2m map, and choose
+             * the appropriate shadow mode */
+            old_guest_table = pagetable_get_mfn(v->arch.guest_table);
+#if CONFIG_PAGING_LEVELS == 2
+            v->arch.guest_table =
+                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,2,2);
+#elif CONFIG_PAGING_LEVELS == 3 
+            v->arch.guest_table =
+                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3);
+#else /* CONFIG_PAGING_LEVELS == 4 */
+            { 
+                l4_pgentry_t *l4e; 
+                /* Use the start of the first l3 table as a PAE l3 */
+                ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
+                l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+                ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
+                v->arch.guest_table =
+                    pagetable_from_pfn(l4e_get_pfn(l4e[0]));
+                sh2_unmap_domain_page(l4e);
+            }
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3);
+#endif
+            /* Fix up refcounts on guest_table */
+            /* NOTE(review): get_page()'s return value is ignored here --
+             * presumably the p2m page is always owned by d; confirm. */
+            get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
+            if ( mfn_x(old_guest_table) != 0 )
+                put_page(mfn_to_page(old_guest_table));
+        }
+        else
+        {
+            set_bit(_VCPUF_shadow2_translate, &v->vcpu_flags);
+
+#ifdef __x86_64__
+            if ( hvm_long_mode_enabled(v) )
+            {
+                // long mode guest...
+                v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4);
+            }
+            else
+#endif
+                if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE )
+                {
+#if CONFIG_PAGING_LEVELS >= 3
+                    // 32-bit PAE mode guest...
+                    v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3);
+#else
+                    SHADOW2_ERROR("PAE not supported in 32-bit Xen\n");
+                    domain_crash(d);
+                    return;
+#endif
+                }
+                else
+                {
+                    // 32-bit 2 level guest...
+#if CONFIG_PAGING_LEVELS >= 3
+                    v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 2);
+#else
+                    v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2);
+#endif
+                }
+        }
+        
+        /* First time through for this vcpu: build its monitor table */
+        if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
+        {
+            mfn_t mmfn = shadow2_make_monitor_table(v);
+            v->arch.monitor_table = pagetable_from_mfn(mmfn);
+            v->arch.monitor_vtable = sh2_map_domain_page(mmfn);
+        } 
+
+        if ( v->arch.shadow2 != old_entries )
+        {
+            SHADOW2_PRINTK("new paging mode: d=%u v=%u g=%u s=%u "
+                           "(was g=%u s=%u)\n",
+                           d->domain_id, v->vcpu_id, 
+                           v->arch.shadow2->guest_levels,
+                           v->arch.shadow2->shadow_levels,
+                           old_entries ? old_entries->guest_levels : 0,
+                           old_entries ? old_entries->shadow_levels : 0);
+            if ( old_entries &&
+                 (v->arch.shadow2->shadow_levels !=
+                  old_entries->shadow_levels) )
+            {
+                /* Need to make a new monitor table for the new mode */
+                mfn_t new_mfn, old_mfn;
+
+                /* Only the vcpu itself may rebuild its monitor table:
+                 * we are about to switch CR3 below. */
+                if ( v != current ) 
+                {
+                    SHADOW2_ERROR("Some third party (d=%u v=%u) is changing "
+                                  "this HVM vcpu's (d=%u v=%u) paging mode!\n",
+                                  current->domain->domain_id, current->vcpu_id,
+                                  v->domain->domain_id, v->vcpu_id);
+                    domain_crash(v->domain);
+                    return;
+                }
+
+                sh2_unmap_domain_page(v->arch.monitor_vtable);
+                old_mfn = pagetable_get_mfn(v->arch.monitor_table);
+                v->arch.monitor_table = pagetable_null();
+                new_mfn = v->arch.shadow2->make_monitor_table(v);            
+                v->arch.monitor_table = pagetable_from_mfn(new_mfn);
+                v->arch.monitor_vtable = sh2_map_domain_page(new_mfn);
+                SHADOW2_PRINTK("new monitor table %"SH2_PRI_mfn "\n",
+                               mfn_x(new_mfn));
+
+                /* Don't be running on the old monitor table when we 
+                 * pull it down!  Switch CR3, and warn the HVM code that
+                 * its host cr3 has changed. */
+                make_cr3(v, mfn_x(new_mfn));
+                write_ptbase(v);
+                hvm_update_host_cr3(v);
+                old_entries->destroy_monitor_table(v, old_mfn);
+            }
+        }
+
+        // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
+        //        These are HARD: think about the case where two CPU's have
+        //        different values for CR4.PSE and CR4.PGE at the same time.
+        //        This *does* happen, at least for CR4.PGE...
+    }
+
+    v->arch.shadow2->update_cr3(v);
+}
+
+/**************************************************************************/
+/* Turning on and off shadow2 features */
+
+static void sh2_new_mode(struct domain *d, u32 new_mode)
+/* Record the domain's new shadow mode and get every vcpu to
+ * recalculate its paging mode to match. */
+{
+    struct vcpu *vcpu;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d != current->domain);
+
+    d->arch.shadow2_mode = new_mode;
+
+    if ( new_mode & SHM2_translate )
+        shadow2_audit_p2m(d);
+
+    for_each_vcpu(d, vcpu)
+        sh2_update_paging_modes(vcpu);
+}
+
+static int shadow2_enable(struct domain *d, u32 mode)
+/* Turn on "permanent" shadow features: external, translate, refcount.
+ * Can only be called once on a domain, and these features cannot be
+ * disabled. 
+ * Returns 0 for success, -errno for failure. */
+{    
+    unsigned int old_pages;
+    int rv = 0;
+
+    domain_pause(d);
+    shadow2_lock(d);
+
+    /* Sanity check the arguments */
+    if ( d == current->domain 
+         || shadow2_mode_enabled(d)
+         || !(mode & SHM2_enable)
+         || ((mode & SHM2_external) && !(mode & SHM2_translate)) )
+    {
+        rv = -EINVAL;
+        goto out;
+    }
+
+    // XXX -- eventually would like to require that all memory be allocated
+    // *after* shadow2_enabled() is called...  So here, we would test to make
+    // sure that d->page_list is empty.
+#if 0
+    spin_lock(&d->page_alloc_lock);
+    if ( !list_empty(&d->page_list) )
+    {
+        spin_unlock(&d->page_alloc_lock);
+        rv = -EINVAL;
+        goto out;
+    }
+    spin_unlock(&d->page_alloc_lock);
+#endif
+
+    /* Init the shadow memory allocation if the user hasn't done so */
+    old_pages = d->arch.shadow2_total_pages;
+    if ( old_pages == 0 )
+        if ( set_sh2_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
+        {
+            set_sh2_allocation(d, 0, NULL);
+            rv = -ENOMEM;
+            goto out;
+        }
+
+    /* Init the hash table */
+    if ( shadow2_hash_alloc(d) != 0 )
+    {
+        set_sh2_allocation(d, old_pages, NULL);            
+        rv = -ENOMEM;
+        goto out;
+    }
+
+    /* Init the P2M table */
+    if ( mode & SHM2_translate )
+        if ( !shadow2_alloc_p2m_table(d) )
+        {
+            shadow2_hash_teardown(d);
+            set_sh2_allocation(d, old_pages, NULL);
+            shadow2_p2m_teardown(d);
+            rv = -ENOMEM;
+            goto out;
+        }
+
+    /* Update the bits */
+    sh2_new_mode(d, mode);
+    shadow2_audit_p2m(d);
+ out:
+    shadow2_unlock(d);
+    domain_unpause(d);
+    return 0;
+}
+
+void shadow2_teardown(struct domain *d)
+/* Destroy the shadow pagetables of this domain and free its shadow memory.
+ * Should only be called for dying domains. */
+{
+    struct vcpu *v;
+    mfn_t mfn;
+
+    ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
+    ASSERT(d != current->domain);
+
+    /* NOTE(review): if the caller already holds the lock we skip taking
+     * it here, but shadow2_unlock() below still runs unconditionally --
+     * confirm that callers entering with the lock held expect it to be
+     * released on return. */
+    if ( !shadow2_lock_is_acquired(d) )
+        shadow2_lock(d); /* Keep various asserts happy */
+
+    if ( shadow2_mode_enabled(d) )
+    {
+        /* Release the shadow and monitor tables held by each vcpu */
+        for_each_vcpu(d, v)
+        {
+            if ( v->arch.shadow2 )
+                shadow2_detach_old_tables(v);
+            if ( shadow2_mode_external(d) )
+            {
+                mfn = pagetable_get_mfn(v->arch.monitor_table);
+                if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
+                    shadow2_destroy_monitor_table(v, mfn);
+                v->arch.monitor_table = pagetable_null();
+            }
+        }
+    }
+
+    if ( d->arch.shadow2_total_pages != 0 )
+    {
+        SHADOW2_PRINTK("teardown of domain %u starts."
+                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->domain_id,
+                       d->arch.shadow2_total_pages, 
+                       d->arch.shadow2_free_pages, 
+                       d->arch.shadow2_p2m_pages);
+        /* Destroy all the shadows and release memory to domheap */
+        set_sh2_allocation(d, 0, NULL);
+        /* Release the hash table back to xenheap */
+        if (d->arch.shadow2_hash_table) 
+            shadow2_hash_teardown(d);
+        /* Release the log-dirty bitmap of dirtied pages */
+        sh2_free_log_dirty_bitmap(d);
+        /* Should not have any more memory held */
+        SHADOW2_PRINTK("teardown done."
+                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->arch.shadow2_total_pages, 
+                       d->arch.shadow2_free_pages, 
+                       d->arch.shadow2_p2m_pages);
+        ASSERT(d->arch.shadow2_total_pages == 0);
+    }
+
+    /* We leave the "permanent" shadow modes enabled, but clear the
+     * log-dirty mode bit.  We don't want any more mark_dirty()
+     * calls now that we've torn down the bitmap */
+    d->arch.shadow2_mode &= ~SHM2_log_dirty;
+
+    shadow2_unlock(d);
+}
+
+void shadow2_final_teardown(struct domain *d)
+/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
+{
+
+    SHADOW2_PRINTK("dom %u final teardown starts."
+                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                   d->domain_id,
+                   d->arch.shadow2_total_pages, 
+                   d->arch.shadow2_free_pages, 
+                   d->arch.shadow2_p2m_pages);
+
+    /* Double-check that the domain didn't have any shadow memory.  
+     * It is possible for a domain that never got domain_kill()ed
+     * to get here with its shadow allocation intact. */
+    if ( d->arch.shadow2_total_pages != 0 )
+        shadow2_teardown(d);
+
+    /* It is now safe to pull down the p2m map. */
+    if ( d->arch.shadow2_p2m_pages != 0 )
+        shadow2_p2m_teardown(d);
+
+    SHADOW2_PRINTK("dom %u final teardown done."
+                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                   d->domain_id,
+                   d->arch.shadow2_total_pages, 
+                   d->arch.shadow2_free_pages, 
+                   d->arch.shadow2_p2m_pages);
+}
+
+static int shadow2_one_bit_enable(struct domain *d, u32 mode)
+/* Turn on a single shadow mode feature.  Caller holds the shadow2 lock.
+ * Returns 0, -EINVAL for a bad request, or -ENOMEM. */
+{
+    ASSERT(shadow2_lock_is_acquired(d));
+
+    /* Refuse self-modification and requests for a bit already set */
+    if ( (d == current->domain) || (d->arch.shadow2_mode & mode) )
+        return -EINVAL;
+
+    if ( d->arch.shadow2_mode == 0 )
+    {
+        /* First feature bit: bring up the shadow memory allocation
+         * and the hash table */
+        if ( (set_sh2_allocation(d, 1, NULL) != 0)
+             || (shadow2_hash_alloc(d) != 0) )
+        {
+            set_sh2_allocation(d, 0, NULL);
+            return -ENOMEM;
+        }
+    }
+
+    /* Record the new mode and propagate it to the vcpus */
+    sh2_new_mode(d, d->arch.shadow2_mode | mode);
+
+    return 0;
+}
+
+static int shadow2_one_bit_disable(struct domain *d, u32 mode) 
+/* Turn off a single shadow mode feature */
+{
+    struct vcpu *v;
+    ASSERT(shadow2_lock_is_acquired(d));
+
+    /* Sanity check the call */
+    if ( d == current->domain || !(d->arch.shadow2_mode & mode) )
+    {
+        return -EINVAL;
+    }
+
+    /* Update the bits */
+    sh2_new_mode(d, d->arch.shadow2_mode & ~mode);
+    if ( d->arch.shadow2_mode == 0 )
+    {
+        /* Get this domain off shadows */
+        SHADOW2_PRINTK("un-shadowing of domain %u starts."
+                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->domain_id,
+                       d->arch.shadow2_total_pages, 
+                       d->arch.shadow2_free_pages, 
+                       d->arch.shadow2_p2m_pages);
+        /* Point each vcpu's cr3 back at its own guest tables now that
+         * there are no shadows to run on */
+        for_each_vcpu(d, v)
+        {
+            if ( v->arch.shadow2 )
+                shadow2_detach_old_tables(v);
+#if CONFIG_PAGING_LEVELS == 4
+            if ( !(v->arch.flags & TF_kernel_mode) )
+                make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
+            else
+#endif
+                make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
+
+        }
+
+        /* Pull down the memory allocation */
+        if ( set_sh2_allocation(d, 0, NULL) != 0 )
+        {
+            // XXX - How can this occur?
+            //       Seems like a bug to return an error now that we've
+            //       disabled the relevant shadow mode.
+            //
+            return -ENOMEM;
+        }
+        shadow2_hash_teardown(d);
+        SHADOW2_PRINTK("un-shadowing of domain %u done."
+                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->domain_id,
+                       d->arch.shadow2_total_pages, 
+                       d->arch.shadow2_free_pages, 
+                       d->arch.shadow2_p2m_pages);
+    }
+
+    return 0;
+}
+
+/* Enable/disable ops for the "test" and "log-dirty" modes */
+int shadow2_test_enable(struct domain *d)
+/* Turn on the basic "test" shadow mode (the SHM2_enable bit only).
+ * Pauses the domain around the change. */
+{
+    int rc = -EINVAL;
+
+    domain_pause(d);
+    shadow2_lock(d);
+
+    if ( shadow2_mode_enabled(d) )
+    {
+        SHADOW2_ERROR("Don't support enabling test mode"
+                      "on already shadowed doms\n");
+        goto out;
+    }
+
+    rc = shadow2_one_bit_enable(d, SHM2_enable);
+ out:
+    shadow2_unlock(d);
+    domain_unpause(d);
+
+    return rc;
+}
+
+int shadow2_test_disable(struct domain *d)
+/* Turn off the basic "test" shadow mode (the SHM2_enable bit only).
+ * Pauses the domain around the change. */
+{
+    int rc;
+
+    domain_pause(d);
+    shadow2_lock(d);
+    rc = shadow2_one_bit_disable(d, SHM2_enable);
+    shadow2_unlock(d);
+    domain_unpause(d);
+
+    return rc;
+}
+
+static int
+sh2_alloc_log_dirty_bitmap(struct domain *d)
+/* Allocate and zero the log-dirty bitmap, sized to cover the guest's
+ * max_pfn rounded up to a whole number of longs.
+ * Returns 0 on success, -ENOMEM on allocation failure. */
+{
+    unsigned int bits;
+
+    ASSERT(d->arch.shadow_dirty_bitmap == NULL);
+
+    /* Round the pfn count up to a multiple of BITS_PER_LONG */
+    bits = (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
+        ~(BITS_PER_LONG - 1);
+    d->arch.shadow_dirty_bitmap_size = bits;
+    d->arch.shadow_dirty_bitmap =
+        xmalloc_array(unsigned long, bits / BITS_PER_LONG);
+
+    if ( d->arch.shadow_dirty_bitmap == NULL )
+    {
+        d->arch.shadow_dirty_bitmap_size = 0;
+        return -ENOMEM;
+    }
+
+    memset(d->arch.shadow_dirty_bitmap, 0, bits / 8);
+    return 0;
+}
+
+static void
+sh2_free_log_dirty_bitmap(struct domain *d)
+/* Release the log-dirty bitmap (if one was allocated) and reset its
+ * recorded size.  Safe to call when no bitmap is present. */
+{
+    d->arch.shadow_dirty_bitmap_size = 0;
+    if ( d->arch.shadow_dirty_bitmap != NULL )
+    {
+        xfree(d->arch.shadow_dirty_bitmap);
+        d->arch.shadow_dirty_bitmap = NULL;
+    }
+}
+
+static int shadow2_log_dirty_enable(struct domain *d)
+/* Turn on log-dirty mode: allocate the dirty bitmap and set the
+ * SHM2_log_dirty bit.  Pauses the domain around the change.
+ * Returns 0, -EINVAL, or -ENOMEM. */
+{
+    int ret;
+
+    domain_pause(d);
+    shadow2_lock(d);
+
+    if ( shadow2_mode_log_dirty(d) )
+    {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if ( shadow2_mode_enabled(d) )
+    {
+        SHADOW2_ERROR("Don't (yet) support enabling log-dirty"
+                      "on already shadowed doms\n");
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = sh2_alloc_log_dirty_bitmap(d);
+    if ( ret != 0 )
+    {
+        /* Harmless even if the alloc failed early: the free routine
+         * checks for a NULL bitmap */
+        sh2_free_log_dirty_bitmap(d);
+        goto out;
+    }
+
+    /* Roll back the bitmap if the mode change itself fails */
+    ret = shadow2_one_bit_enable(d, SHM2_log_dirty);
+    if ( ret != 0 )
+        sh2_free_log_dirty_bitmap(d);
+
+ out:
+    shadow2_unlock(d);
+    domain_unpause(d);
+    return ret;
+}
+
+static int shadow2_log_dirty_disable(struct domain *d)
+/* Turn off log-dirty mode and, once the bit is clear, free the
+ * dirty bitmap.  Pauses the domain around the change. */
+{
+    int rc;
+
+    domain_pause(d);
+    shadow2_lock(d);
+
+    rc = shadow2_one_bit_disable(d, SHM2_log_dirty);
+    if ( !shadow2_mode_log_dirty(d) )
+        sh2_free_log_dirty_bitmap(d);
+
+    shadow2_unlock(d);
+    domain_unpause(d);
+
+    return rc;
+}
+
+/**************************************************************************/
+/* P2M map manipulations */
+
+static void
+sh2_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
+/* Drop the gfn->mfn mapping from the p2m, tearing down any shadows and
+ * writable mappings of the frame first.  No-op for non-translated
+ * domains.  Caller holds the shadow2 lock. */
+{
+    struct vcpu *v;
+
+    if ( !shadow2_mode_translate(d) )
+        return;
+
+    /* Use the current vcpu if it belongs to d, else fall back to vcpu 0 */
+    v = current;
+    if ( v->domain != d )
+        v = d->vcpu[0];
+
+
+    SHADOW2_PRINTK("removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+    ASSERT(mfn_x(sh2_gfn_to_mfn(d, gfn)) == mfn);
+    //ASSERT(sh2_mfn_to_gfn(d, mfn) == gfn);
+
+    shadow2_remove_all_shadows_and_parents(v, _mfn(mfn));
+    if ( shadow2_remove_all_mappings(v, _mfn(mfn)) )
+        flush_tlb_mask(d->domain_dirty_cpumask);
+    /* Mark the p2m slot invalid and clear the reverse (m2p) entry */
+    shadow2_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
+    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+}
+
+void
+shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
+                                  unsigned long mfn)
+/* Locked wrapper around sh2_p2m_remove_page(), auditing the p2m map
+ * both before and after the change. */
+{
+    shadow2_lock(d);
+
+    shadow2_audit_p2m(d);
+    sh2_p2m_remove_page(d, gfn, mfn);
+    shadow2_audit_p2m(d);
+
+    shadow2_unlock(d);
+}
+
+void
+shadow2_guest_physmap_add_page(struct domain *d, unsigned long gfn,
+                               unsigned long mfn)
+/* Install a gfn->mfn mapping in the p2m, first removing any old mapping
+ * at this gfn and any old gfn for this mfn (unshadowing as needed).
+ * No-op for non-translated domains. */
+{
+    struct vcpu *v;
+    unsigned long ogfn;
+    mfn_t omfn;
+
+    if ( !shadow2_mode_translate(d) )
+        return;
+
+    /* Use the current vcpu if it belongs to d, else fall back to vcpu 0 */
+    v = current;
+    if ( v->domain != d )
+        v = d->vcpu[0];
+
+    shadow2_lock(d);
+    shadow2_audit_p2m(d);
+
+    SHADOW2_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+    omfn = sh2_gfn_to_mfn(d, gfn);
+    if ( valid_mfn(omfn) )
+    {
+        /* Get rid of the old mapping, especially any shadows */
+        shadow2_remove_all_shadows_and_parents(v, omfn);
+        if ( shadow2_remove_all_mappings(v, omfn) )
+            flush_tlb_mask(d->domain_dirty_cpumask);
+        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+    }        
+
+    ogfn = sh2_mfn_to_gfn(d, _mfn(mfn));
+    if (
+#ifdef __x86_64__
+        /* 0x5555... appears to be the m2p table's fill pattern for
+         * never-initialized entries -- TODO confirm */
+        (ogfn != 0x5555555555555555L)
+#else
+        (ogfn != 0x55555555L)
+#endif
+        && (ogfn != INVALID_M2P_ENTRY)
+        && (ogfn != gfn) )
+    {
+        /* This machine frame is already mapped at another physical address */
+        SHADOW2_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
+                       mfn, ogfn, gfn);
+        if ( valid_mfn(omfn = sh2_gfn_to_mfn(d, ogfn)) ) 
+        {
+            SHADOW2_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n", 
+                           ogfn , mfn_x(omfn));
+            /* Only tear down the old mapping if it really still points
+             * at this mfn */
+            if ( mfn_x(omfn) == mfn ) 
+                sh2_p2m_remove_page(d, ogfn, mfn);
+        }
+    }
+
+    shadow2_set_p2m_entry(d, gfn, _mfn(mfn));
+    set_gpfn_from_mfn(mfn, gfn);
+    shadow2_audit_p2m(d);
+    shadow2_unlock(d);
+}
+
+/**************************************************************************/
+/* Log-dirty mode support */
+
+/* Convert a shadow to log-dirty mode. */
+/* Not implemented in shadow2: must never be reached. */
+void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
+{
+    BUG();
+}
+
+
+/* Read a domain's log-dirty bitmap and stats.  
+ * If the operation is a CLEAN, clear the bitmap and stats as well. */
+static int shadow2_log_dirty_op(struct domain *d, dom0_shadow_control_t *sc)
+{    
+    int i, rv = 0, clean = 0;
+
+    domain_pause(d);
+    shadow2_lock(d);
+
+    if ( sc->op == DOM0_SHADOW_CONTROL_OP_CLEAN
+         || sc->op == DOM0_SHADOW_CONTROL_OP_FLUSH ) 
+        clean = 1;
+    else 
+        ASSERT(sc->op == DOM0_SHADOW_CONTROL_OP_PEEK); 
+
+    SHADOW2_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n", 
+                  (clean) ? "clean" : "peek",
+                  d->domain_id,
+                  d->arch.shadow_fault_count, 
+                  d->arch.shadow_dirty_count);
+
+    sc->stats.fault_count = d->arch.shadow_fault_count;
+    sc->stats.dirty_count = d->arch.shadow_dirty_count;    
+        
+    if ( clean ) 
+    {
+        struct list_head *l, *t;
+        struct page_info *pg;
+
+        /* Need to revoke write access to the domain's pages again. 
+         * In future, we'll have a less heavy-handed approach to this, 
+         * but for now, we just unshadow everything except Xen. */
+        list_for_each_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+        {
+            pg = list_entry(l, struct page_info, list);
+            shadow2_unhook_mappings(d->vcpu[0], page_to_mfn(pg));
+        }
+
+        d->arch.shadow_fault_count = 0;
+        d->arch.shadow_dirty_count = 0;
+    }
+
+    if ( guest_handle_is_null(sc->dirty_bitmap) ||
+         (d->arch.shadow_dirty_bitmap == NULL) )
+    {
+        rv = -EINVAL;
+        goto out;
+    }
+    if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
+        sc->pages = d->arch.shadow_dirty_bitmap_size; 
+
+#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
+    for ( i = 0; i < sc->pages; i += CHUNK )
+    {
+        int bytes = ((((sc->pages - i) > CHUNK) 
+                      ? CHUNK 
+                      : (sc->pages - i)) + 7) / 8;
+     
+        if ( copy_to_guest_offset(
+                 sc->dirty_bitmap, 
+                 i/(8*sizeof(unsigned long)),
+                 d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                 (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
+        {
+            rv = -EINVAL;
+            goto out;
+        }
+
+        if ( clean )
+            memset(d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                   0, bytes);
+    }
+#undef CHUNK
+
+ out:
+    shadow2_unlock(d);
+    domain_unpause(d);
+    return 0;
+}
+
+
+/* Mark a page as dirty */
+/* Sets the bit for gmfn's pfn in the log-dirty bitmap and bumps the
+ * dirty count on a 0->1 transition.  Caller holds the shadow2 lock
+ * and the domain must be in log-dirty mode. */
+void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn)
+{
+    unsigned long pfn;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(shadow2_mode_log_dirty(d));
+
+    if ( !valid_mfn(gmfn) )
+        return;
+
+    ASSERT(d->arch.shadow_dirty_bitmap != NULL);
+
+    /* We /really/ mean PFN here, even for non-translated guests. */
+    pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+    /*
+     * Values with the MSB set denote MFNs that aren't really part of the 
+     * domain's pseudo-physical memory map (e.g., the shared info frame).
+     * Nothing to do here...
+     */
+    if ( unlikely(!VALID_M2P(pfn)) )
+        return;
+
+    /* N.B. Can use non-atomic TAS because protected by shadow2_lock. */
+    if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) ) 
+    { 
+        if ( !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) )
+        {
+            SHADOW2_DEBUG(LOGDIRTY, 
+                          "marked mfn %" SH2_PRI_mfn " (pfn=%lx), dom %d\n",
+                          mfn_x(gmfn), pfn, d->domain_id);
+            d->arch.shadow_dirty_count++;
+        }
+    }
+    else
+    {
+        /* pfn lies beyond the bitmap: log full diagnostics rather than
+         * writing out of range */
+        SHADOW2_PRINTK("mark_dirty OOR! "
+                       "mfn=%" SH2_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
+                       "owner=%d c=%08x t=%" PRtype_info "\n",
+                       mfn_x(gmfn), 
+                       pfn, 
+                       d->arch.shadow_dirty_bitmap_size,
+                       d->domain_id,
+                       (page_get_owner(mfn_to_page(gmfn))
+                        ? page_get_owner(mfn_to_page(gmfn))->domain_id
+                        : -1),
+                       mfn_to_page(gmfn)->count_info, 
+                       mfn_to_page(gmfn)->u.inuse.type_info);
+    }
+}
+
+
+/**************************************************************************/
+/* Shadow-control DOM0_OP dispatcher */
+
+/* Dispatcher for the DOM0_SHADOW*_CONTROL_OP_* sub-operations.
+ * Returns 0 on success, a -ve errno value on failure, or the result of
+ * a hypercall continuation if the allocation op was preempted. */
+int shadow2_control_op(struct domain *d, 
+                       dom0_shadow_control_t *sc,
+                       XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op)
+{
+    int rc, preempted = 0;
+
+    /* Shadow ops reconfigure the target domain's paging, so the caller
+     * may not target itself. */
+    if ( unlikely(d == current->domain) )
+    {
+        DPRINTK("Don't try to do a shadow op on yourself!\n");
+        return -EINVAL;
+    }
+
+    switch ( sc->op )
+    {
+    case DOM0_SHADOW_CONTROL_OP_OFF:
+        /* Tear down log-dirty first, then shadowing itself if enabled. */
+        if ( shadow2_mode_log_dirty(d) )
+            if ( (rc = shadow2_log_dirty_disable(d)) != 0 ) 
+                return rc;
+        if ( d->arch.shadow2_mode & SHM2_enable )
+            if ( (rc = shadow2_test_disable(d)) != 0 ) 
+                return rc;
+        return 0;
+
+    case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
+        return shadow2_test_enable(d);
+        
+    case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
+        return shadow2_log_dirty_enable(d);
+        
+    case DOM0_SHADOW_CONTROL_OP_FLUSH:
+    case DOM0_SHADOW_CONTROL_OP_CLEAN:
+    case DOM0_SHADOW_CONTROL_OP_PEEK:
+        /* All three bitmap ops share one handler; sc->op selects the
+         * exact behaviour inside shadow2_log_dirty_op(). */
+        return shadow2_log_dirty_op(d, sc);
+
+
+
+    case DOM0_SHADOW2_CONTROL_OP_ENABLE:
+        /* Shift the caller's mode bits up into SHM2_* flag positions. */
+        return shadow2_enable(d, sc->mode << SHM2_shift);        
+
+    case DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION:
+        sc->mb = shadow2_get_allocation(d);
+        return 0;
+        
+    case DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION:
+        rc = shadow2_set_allocation(d, sc->mb, &preempted);
+        if ( preempted )
+            /* Not finished.  Set up to re-run the call. */
+            rc = hypercall_create_continuation(
+                __HYPERVISOR_dom0_op, "h", u_dom0_op);
+        else 
+            /* Finished.  Return the new allocation */
+            sc->mb = shadow2_get_allocation(d);
+        return rc;
+        
+        
+    default:
+        SHADOW2_ERROR("Bad shadow op %u\n", sc->op);
+        return -EINVAL;
+    }
+}
+
+
+/**************************************************************************/
+/* Auditing shadow tables */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL
+
+/* Walk the shadow hash table, running the per-type audit function on
+ * every shadow page selected by the audit mask.  No-op unless auditing
+ * is enabled at runtime. */
+void shadow2_audit_tables(struct vcpu *v) 
+{
+    /* Dispatch table for getting per-type functions: indexed by shadow
+     * page type; NULL entries are not audited. */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,2,2),  /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,2,2), /* fl1_32  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,2,2),  /* l2_32   */
+#else 
+        SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,2),  /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,2), /* fl1_32  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,2),  /* l2_32   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,3),  /* l1_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,3), /* fl1_pae */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3),  /* l2_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3),  /* l2h_pae */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,3,3),  /* l3_pae  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,4,4),  /* l1_64   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,4,4), /* fl1_64  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,4,4),  /* l2_64   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,4,4),  /* l3_64   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l4_table,4,4),  /* l4_64   */
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS > 2 */
+        NULL  /* All the rest */
+    };
+    unsigned int mask; 
+
+    if ( !(SHADOW2_AUDIT_ENABLE) )
+        return;
+    
+    if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL )
+        mask = ~1; /* Audit every table in the system */
+    else 
+    {
+        /* Audit only the current mode's tables */
+        switch (v->arch.shadow2->guest_levels)
+        {
+        case 2: mask = (SH2F_L1_32|SH2F_FL1_32|SH2F_L2_32); break;
+        case 3: mask = (SH2F_L1_PAE|SH2F_FL1_PAE|SH2F_L2_PAE
+                        |SH2F_L2H_PAE|SH2F_L3_PAE); break;
+        case 4: mask = (SH2F_L1_64|SH2F_FL1_64|SH2F_L2_64  
+                        |SH2F_L3_64|SH2F_L4_64); break;
+        default: BUG();
+        }
+    }
+
+    /* Fix: pass the mask computed above.  Previously ~1 was passed here,
+     * which silently discarded the current-mode-only mask built in the
+     * else branch (harmless only in the ENTRIES_FULL configuration). */
+    hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
+}
+
+#endif /* Shadow audit */
+
+
+/**************************************************************************/
+/* Auditing p2m tables */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M
+
+void shadow2_audit_p2m(struct domain *d)
+{
+    struct list_head *entry;
+    struct page_info *page;
+    struct domain *od;
+    unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
+    mfn_t p2mfn;
+    unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
+    int test_linear;
+    
+    if ( !(SHADOW2_AUDIT_ENABLE) || !shadow2_mode_translate(d) )
+        return;
+
+    //SHADOW2_PRINTK("p2m audit starts\n");
+
+    test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
+    if ( test_linear )
+        local_flush_tlb(); 
+
+    /* Audit part one: walk the domain's page allocation list, checking 
+     * the m2p entries. */
+    for ( entry = d->page_list.next;
+          entry != &d->page_list;
+          entry = entry->next )
+    {
+        page = list_entry(entry, struct page_info, list);
+        mfn = mfn_x(page_to_mfn(page));
+
+        // SHADOW2_PRINTK("auditing guest page, mfn=%#lx\n", mfn); 
+
+        od = page_get_owner(page);
+
+        if ( od != d ) 
+        {
+            SHADOW2_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
+                           mfn, od, (od?od->domain_id:-1), d, d->domain_id);
+            continue;
+        }
+
+        gfn = get_gpfn_from_mfn(mfn);
+        if ( gfn == INVALID_M2P_ENTRY ) 
+        {
+            orphans_i++;
+            //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
+            //               mfn); 
+            continue;
+        }
+
+        if ( gfn == 0x55555555 ) 
+        {
+            orphans_d++;
+            //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n", 
+            //               mfn); 
+            continue;
+        }
+
+        p2mfn = sh2_gfn_to_mfn_foreign(d, gfn);
+        if ( mfn_x(p2mfn) != mfn )
+        {
+            mpbad++;
+            SHADOW2_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
+                           " (-> gfn %#lx)\n",
+                           mfn, gfn, mfn_x(p2mfn),
+                           (mfn_valid(p2mfn)
+                            ? get_gpfn_from_mfn(mfn_x(p2mfn))
+                            : -1u));
+            /* This m2p entry is stale: the domain has another frame in
+             * this physical slot.  No great disaster, but for neatness,
+             * blow away the m2p entry. */ 
+            set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+        }
+
+        if ( test_linear )
+        {
+            lp2mfn = get_mfn_from_gpfn(gfn);
+            if ( lp2mfn != mfn_x(p2mfn) )
+            {
+                SHADOW2_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
+                               "(!= mfn %#lx)\n", gfn, lp2mfn, p2mfn);
+            }
+        }
+
+        // SHADOW2_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n", 
+        //                mfn, gfn, p2mfn, lp2mfn); 
+    }   
+
+    /* Audit part two: walk the domain's p2m table, checking the entries. */
+    if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
+    {
+        l2_pgentry_t *l2e;
+        l1_pgentry_t *l1e;
+        int i1, i2;
+        
+#if CONFIG_PAGING_LEVELS == 4
+        l4_pgentry_t *l4e;
+        l3_pgentry_t *l3e;
+        int i3, i4;
+        l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#elif CONFIG_PAGING_LEVELS == 3
+        l3_pgentry_t *l3e;
+        int i3;
+        l3e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#else /* CONFIG_PAGING_LEVELS == 2 */
+        l2e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#endif
+
+        gfn = 0;
+#if CONFIG_PAGING_LEVELS >= 3
+#if CONFIG_PAGING_LEVELS >= 4
+        for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+        {
+            if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+            {
+                gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
+                continue;
+            }
+            l3e = sh2_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
+#endif /* now at levels 3 or 4... */
+            for ( i3 = 0; 
+                  i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8); 
+                  i3++ )
+            {
+                if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+                {
+                    gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+                    continue;
+                }
+                l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
+#endif /* all levels... */
+                for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+                {
+                    if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+                    {
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+                    l1e = sh2_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
+                    
+                    for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+                    {
+                        if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+                            continue;
+                        mfn = l1e_get_pfn(l1e[i1]);
+                        ASSERT(valid_mfn(_mfn(mfn)));
+                        m2pfn = get_gpfn_from_mfn(mfn);
+                        if ( m2pfn != gfn )
+                        {
+                            pmbad++;
+                            SHADOW2_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                           " -> gfn %#lx\n", gfn, mfn, m2pfn);
+                            BUG();
+                        }
+                    }
+                    sh2_unmap_domain_page(l1e);
+                }
+#if CONFIG_PAGING_LEVELS >= 3
+                sh2_unmap_domain_page(l2e);
+            }
+#if CONFIG_PAGING_LEVELS >= 4
+            sh2_unmap_domain_page(l3e);
+        }
+#endif
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+        sh2_unmap_domain_page(l4e);
+#elif CONFIG_PAGING_LEVELS == 3
+        sh2_unmap_domain_page(l3e);
+#else /* CONFIG_PAGING_LEVELS == 2 */
+        sh2_unmap_domain_page(l2e);
+#endif
+
+    }
+
+    //SHADOW2_PRINTK("p2m audit complete\n");
+    //if ( orphans_i | orphans_d | mpbad | pmbad ) 
+    //    SHADOW2_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
+    //                   orphans_i + orphans_d, orphans_i, orphans_d,
+    if ( mpbad | pmbad ) 
+        SHADOW2_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
+                       pmbad, mpbad);
+}
+
+#endif /* p2m audit */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End: 
+ */
diff --git a/xen/arch/x86/shadow2.c b/xen/arch/x86/shadow2.c
new file mode 100644 (file)
index 0000000..9d845cb
--- /dev/null
@@ -0,0 +1,4469 @@
+/******************************************************************************
+ * arch/x86/shadow2.c
+ *
+ * Simple, mostly-synchronous shadow page tables. 
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+// DESIGN QUESTIONS:
+// Why use subshadows for PAE guests?
+// - reduces pressure in the hash table
+// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3)
+// - would need to find space in the page_info to store 7 more bits of
+//   backpointer
+// - independent shadows of 32 byte chunks makes it non-obvious how to quickly
+//   figure out when to demote the guest page from l3 status
+//
+// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space.
+// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address
+//   space for both PV and HVM guests.
+//
+
+#define SHADOW2 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/domain_page.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/shadow2.h>
+#include <asm/shadow2-private.h>
+#include <asm/shadow2-types.h>
+#include <asm/flushtlb.h>
+#include <asm/hvm/hvm.h>
+
+/* The first cut: an absolutely synchronous, trap-and-emulate version,
+ * supporting only HVM guests (and so only "external" shadow mode). 
+ *
+ * THINGS TO DO LATER:
+ * 
+ * FIX GVA_TO_GPA
+ * The current interface returns an unsigned long, which is not big enough
+ * to hold a physical address in PAE.  Should return a gfn instead.
+ * 
+ * TEARDOWN HEURISTICS
+ * Also: have a heuristic for when to destroy a previous paging-mode's 
+ * shadows.  When a guest is done with its start-of-day 32-bit tables
+ * and reuses the memory we want to drop those shadows.  Start with 
+ * shadows in a page in two modes as a hint, but beware of clever tricks 
+ * like reusing a pagetable for both PAE and 64-bit during boot...
+ *
+ * PAE LINEAR MAPS
+ * Rework shadow_get_l*e() to have the option of using map_domain_page()
+ * instead of linear maps.  Add appropriate unmap_l*e calls in the users. 
+ * Then we can test the speed difference made by linear maps.  If the 
+ * map_domain_page() version is OK on PAE, we could maybe allow a lightweight 
+ * l3-and-l2h-only shadow mode for PAE PV guests that would allow them 
+ * to share l2h pages again. 
+ *
+ * PAE L3 COPYING
+ * In this code, we copy all 32 bytes of a PAE L3 every time we change an 
+ * entry in it, and every time we change CR3.  We copy it for the linear 
+ * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
+ * buffer so it fits in CR3.  Maybe we can avoid some of this recopying 
+ * by using the shadow directly in some places. 
+ * Also, for SMP, need to actually respond to seeing shadow2_pae_flip_pending.
+ *
+ * GUEST_WALK_TABLES TLB FLUSH COALESCE
+ * guest_walk_tables can do up to three remote TLB flushes as it walks to
+ * the first l1 of a new pagetable.  Should coalesce the flushes to the end, 
+ * and if we do flush, re-do the walk.  If anything has changed, then 
+ * pause all the other vcpus and do the walk *again*.
+ *
+ * WP DISABLED
+ * Consider how to implement having the WP bit of CR0 set to 0.  
+ * Since we need to be able to cause write faults to pagetables, this might
+ * end up looking like not having the (guest) pagetables present at all in 
+ * HVM guests...
+ *
+ * PSE disabled / PSE36
+ * We don't support any modes other than PSE enabled, PSE36 disabled.
+ * Neither of those would be hard to change, but we'd need to be able to 
+ * deal with shadows made in one mode and used in another.
+ */
+
+/* Why a shadow entry is being fetched: a speculative prefetch, or a
+ * demand fault for read or write (the write type also carries the
+ * demand bit, as the enum values below show). */
+#define FETCH_TYPE_PREFETCH 1
+#define FETCH_TYPE_DEMAND   2
+#define FETCH_TYPE_WRITE    4
+typedef enum {
+    ft_prefetch     = FETCH_TYPE_PREFETCH,
+    ft_demand_read  = FETCH_TYPE_DEMAND,
+    ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
+} fetch_type_t;
+
+#ifndef NDEBUG
+/* Human-readable names for debug output, indexed by fetch_type_t.
+ * Uses standard C99 designated-initializer syntax ("[index] = value")
+ * instead of the obsolete GNU "[index] value" form. */
+static char *fetch_type_names[] = {
+    [ft_prefetch]     = "prefetch",
+    [ft_demand_read]  = "demand read",
+    [ft_demand_write] = "demand write",
+};
+#endif
+
+/* XXX forward declarations */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res);
+#endif
+static inline void sh2_update_linear_entries(struct vcpu *v);
+
+/**************************************************************************/
+/* Hash table mapping from guest pagetables to shadows
+ *
+ * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
+ * FL1's:       maps the *gfn* of the start of a superpage to the mfn of a
+ *              shadow L1 which maps its "splinters".
+ * PAE CR3s:    maps the 32-byte aligned, 32-bit CR3 value to the mfn of the
+ *              PAE L3 info page for that CR3 value.
+ */
+
+static inline mfn_t 
+get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
+/* Look up the FL1 shadow for a superpage, keyed by guest gfn */
+{
+    mfn_t shadow_mfn;
+
+    shadow_mfn = shadow2_hash_lookup(v, gfn_x(gfn),
+                                     PGC_SH2_fl1_shadow >> PGC_SH2_type_shift);
+
+    /* If the domain is in log-dirty mode, lazily convert any shadow
+     * we find that hasn't been converted yet. */
+    if ( unlikely(valid_mfn(shadow_mfn) &&
+                  shadow2_mode_log_dirty(v->domain)) )
+    {
+        struct page_info *pg = mfn_to_page(shadow_mfn);
+        if ( (pg->count_info & PGC_SH2_log_dirty) == 0 )
+            shadow2_convert_to_log_dirty(v, shadow_mfn);
+    }
+
+    return shadow_mfn;
+}
+
+static inline mfn_t 
+get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+/* Look up the shadow of the given type for a guest mfn */
+{
+    mfn_t shadow_mfn;
+
+    shadow_mfn = shadow2_hash_lookup(v, mfn_x(gmfn),
+                                     shadow_type >> PGC_SH2_type_shift);
+    perfc_incrc(shadow2_get_shadow_status);
+
+    /* If the domain is in log-dirty mode, lazily convert any shadow
+     * we find that hasn't been converted yet. */
+    if ( unlikely(valid_mfn(shadow_mfn) &&
+                  shadow2_mode_log_dirty(v->domain)) )
+    {
+        struct page_info *pg = mfn_to_page(shadow_mfn);
+        if ( (pg->count_info & PGC_SH2_log_dirty) == 0 )
+            shadow2_convert_to_log_dirty(v, shadow_mfn);
+    }
+
+    return shadow_mfn;
+}
+
+static inline void 
+set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+/* Insert an FL1 shadow into the hash table, keyed by guest gfn */
+{
+    struct page_info *sp = mfn_to_page(smfn);
+
+    SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n",
+                   gfn_x(gfn), PGC_SH2_fl1_shadow, mfn_x(smfn));
+
+    /* Make the shadow's log-dirty flag match the domain's current mode. */
+    if ( unlikely(shadow2_mode_log_dirty(v->domain)) )
+        set_bit(_PGC_SH2_log_dirty, &sp->count_info);
+    else
+        clear_bit(_PGC_SH2_log_dirty, &sp->count_info);
+
+    shadow2_hash_insert(v, gfn_x(gfn),
+                        PGC_SH2_fl1_shadow >> PGC_SH2_type_shift, smfn);
+}
+
+static inline void 
+set_shadow2_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
+/* Put a shadow into the hash table */
+{
+    struct domain *d = v->domain;
+    int res;
+
+    SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
+                   d->domain_id, v->vcpu_id, mfn_x(gmfn),
+                   shadow_type, mfn_x(smfn));
+
+    if ( unlikely(shadow2_mode_log_dirty(d)) )
+        // mark this shadow as a log dirty shadow...
+        set_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
+    else
+        clear_bit(_PGC_SH2_log_dirty, &mfn_to_page(smfn)->count_info);
+
+    /* Hold a general ref on the guest page while it is shadowed; the
+     * matching put_page() is in delete_shadow2_status().
+     * NOTE(review): the result is only checked by ASSERT, so a failed
+     * get_page() would go unnoticed in non-debug builds -- presumably
+     * the caller already guarantees the page is live; confirm. */
+    res = get_page(mfn_to_page(gmfn), d);
+    ASSERT(res == 1);
+
+    shadow2_hash_insert(v, mfn_x(gmfn), shadow_type >> PGC_SH2_type_shift,
+                        smfn);
+}
+
+static inline void 
+delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn)
+/* Drop an FL1 shadow's entry from the hash table */
+{
+    unsigned long key = gfn_x(gfn);
+
+    SHADOW2_PRINTK("gfn=%"SH2_PRI_gfn", type=%08x, smfn=%05lx\n",
+                   key, PGC_SH2_fl1_shadow, mfn_x(smfn));
+
+    shadow2_hash_delete(v, key,
+                        PGC_SH2_fl1_shadow >> PGC_SH2_type_shift, smfn);
+}
+
+static inline void 
+delete_shadow2_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
+/* Drop a shadow's entry from the hash table and release the guest-page
+ * reference taken when it was inserted (see set_shadow2_status()). */
+{
+    struct page_info *guest_pg = mfn_to_page(gmfn);
+
+    SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n",
+                   v->domain->domain_id, v->vcpu_id,
+                   mfn_x(gmfn), shadow_type, mfn_x(smfn));
+
+    shadow2_hash_delete(v, mfn_x(gmfn),
+                        shadow_type >> PGC_SH2_type_shift, smfn);
+    put_page(guest_pg);
+}
+
+
+/**************************************************************************/
+/* Functions for walking the guest page tables */
+
+
+/* Walk the guest pagetables, filling the walk_t with what we see. 
+ * Takes an uninitialised walk_t.  The caller must call unmap_walk() 
+ * on the walk_t before discarding it or calling guest_walk_tables again. 
+ * If "guest_op" is non-zero, we are serving a genuine guest memory access, 
+ * and must (a) be under the shadow2 lock, and (b) remove write access
+ * from any guest PT pages we see, as we will be using their contents to 
+ * perform shadow updates.
+ * Returns 0 for success or non-zero if the guest pagetables are malformed.
+ * N.B. Finding a not-present entry does not cause a non-zero return code. */
+static inline int 
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op)
+{
+    ASSERT(!guest_op || shadow2_lock_is_acquired(v->domain));
+
+    perfc_incrc(shadow2_guest_walk);
+    /* Start from a zeroed walk and record the address being resolved. */
+    memset(gw, 0, sizeof(*gw));
+    gw->va = va;
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    /* Get l4e from the top level table */
+    gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
+    gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va);
+    /* Walk down to the l3e */
+    if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0;
+    gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e));
+    /* Untranslatable gfn in the l4e: malformed guest tables. */
+    if ( !valid_mfn(gw->l3mfn) ) return 1;
+    /* This mfn is a pagetable: make sure the guest can't write to it. */
+    if ( guest_op && shadow2_remove_write_access(v, gw->l3mfn, 3, va) != 0 )
+        flush_tlb_mask(v->domain->domain_dirty_cpumask); 
+    gw->l3e = ((guest_l3e_t *)sh2_map_domain_page(gw->l3mfn))
+        + guest_l3_table_offset(va);
+#else /* PAE only... */
+    /* Get l3e from the top level table */
+    gw->l3mfn = pagetable_get_mfn(v->arch.guest_table);
+    gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va);
+#endif /* PAE or 64... */
+    /* Walk down to the l2e */
+    if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0;
+    gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e));
+    if ( !valid_mfn(gw->l2mfn) ) return 1;
+    /* This mfn is a pagetable: make sure the guest can't write to it. */
+    if ( guest_op && shadow2_remove_write_access(v, gw->l2mfn, 2, va) != 0 )
+        flush_tlb_mask(v->domain->domain_dirty_cpumask); 
+    gw->l2e = ((guest_l2e_t *)sh2_map_domain_page(gw->l2mfn))
+        + guest_l2_table_offset(va);
+#else /* 32-bit only... */
+    /* Get l2e from the top level table */
+    gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
+    gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va);
+#endif /* All levels... */
+    
+    if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0;
+    if ( guest_supports_superpages(v) &&
+         (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) ) 
+    {
+        /* Special case: this guest VA is in a PSE superpage, so there's
+         * no guest l1e.  We make one up so that the propagation code
+         * can generate a shadow l1 table.  Start with the gfn of the 
+         * first 4k-page of the superpage. */
+        gfn_t start = guest_l2e_get_gfn(*gw->l2e);
+        /* Grant full access in the l1e, since all the guest entry's 
+         * access controls are enforced in the shadow l2e.  This lets 
+         * us reflect l2 changes later without touching the l1s. */
+        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+                     _PAGE_ACCESSED|_PAGE_DIRTY);
+        /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
+         * of the level 1 */
+        if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) ) 
+            flags |= _PAGE_PAT; 
+        /* Increment the pfn by the right number of 4k pages.  
+         * The ~0x1 is to mask out the PAT bit mentioned above. */
+        start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
+        gw->eff_l1e = guest_l1e_from_gfn(start, flags);
+        gw->l1e = NULL;
+        gw->l1mfn = _mfn(INVALID_MFN);
+    } 
+    else 
+    {
+        /* Not a superpage: carry on and find the l1e. */
+        gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e));
+        if ( !valid_mfn(gw->l1mfn) ) return 1;
+        /* This mfn is a pagetable: make sure the guest can't write to it. */
+        if ( guest_op 
+             && shadow2_remove_write_access(v, gw->l1mfn, 1, va) != 0 )
+            flush_tlb_mask(v->domain->domain_dirty_cpumask); 
+        gw->l1e = ((guest_l1e_t *)sh2_map_domain_page(gw->l1mfn))
+            + guest_l1_table_offset(va);
+        gw->eff_l1e = *gw->l1e;
+    }
+
+    return 0;
+}
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding frame number. */
+static inline gfn_t
+guest_walk_to_gfn(walk_t *gw)
+{
+    /* A non-present effective l1e means there is no frame to report. */
+    return (guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT)
+        ? guest_l1e_get_gfn(gw->eff_l1e)
+        : _gfn(INVALID_GFN);
+}
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding physical address. */
+static inline paddr_t
+guest_walk_to_gpa(walk_t *gw)
+{
+    /* Report address zero for a non-present mapping. */
+    if ( (guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) == 0 )
+        return 0;
+    /* Frame base from the effective l1e plus the in-page offset. */
+    return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK);
+}
+
+
+/* Unmap (and reinitialise) a guest walk.  
+ * Call this to dispose of any walk filled in by guest_walk_tables().
+ * Only the levels mapped via sh2_map_domain_page() are unmapped; the
+ * top-level entry pointer comes from guest_vtable and needs no unmap. */
+static void unmap_walk(struct vcpu *v, walk_t *gw)
+{
+#if GUEST_PAGING_LEVELS >= 3
+#if GUEST_PAGING_LEVELS >= 4
+    if ( gw->l3e != NULL ) sh2_unmap_domain_page(gw->l3e);
+#endif
+    if ( gw->l2e != NULL ) sh2_unmap_domain_page(gw->l2e);
+#endif
+    if ( gw->l1e != NULL ) sh2_unmap_domain_page(gw->l1e);
+#ifdef DEBUG
+    /* In debug builds, clear the walk so any stale use is obvious. */
+    memset(gw, 0, sizeof(*gw));
+#endif
+}
+
+
+/* Pretty-print the contents of a guest-walk for debugging: the mfn,
+ * mapped pointer, and entry value at each level the walk touched,
+ * finishing with the effective l1e. */
+static inline void print_gw(walk_t *gw)
+{
+    SHADOW2_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    SHADOW2_PRINTK("   l4mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l4mfn));
+    SHADOW2_PRINTK("   l4e=%p\n", gw->l4e);
+    if ( gw->l4e )
+        SHADOW2_PRINTK("   *l4e=%" SH2_PRI_gpte "\n", gw->l4e->l4);
+#endif /* PAE or 64... */
+    SHADOW2_PRINTK("   l3mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l3mfn));
+    SHADOW2_PRINTK("   l3e=%p\n", gw->l3e);
+    if ( gw->l3e )
+        SHADOW2_PRINTK("   *l3e=%" SH2_PRI_gpte "\n", gw->l3e->l3);
+#endif /* All levels... */
+    SHADOW2_PRINTK("   l2mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l2mfn));
+    SHADOW2_PRINTK("   l2e=%p\n", gw->l2e);
+    if ( gw->l2e )
+        SHADOW2_PRINTK("   *l2e=%" SH2_PRI_gpte "\n", gw->l2e->l2);
+    SHADOW2_PRINTK("   l1mfn=%" SH2_PRI_mfn "\n", mfn_x(gw->l1mfn));
+    SHADOW2_PRINTK("   l1e=%p\n", gw->l1e);
+    if ( gw->l1e )
+        SHADOW2_PRINTK("   *l1e=%" SH2_PRI_gpte "\n", gw->l1e->l1);
+    SHADOW2_PRINTK("   eff_l1e=%" SH2_PRI_gpte "\n", gw->eff_l1e.l1);
+}
+
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES
+/* Lightweight audit: pass all the shadows associated with this guest walk
+ * through the audit mechanisms.  Each level's shadow is looked up in the
+ * hash table and audited only if it exists. */
+static void sh2_audit_gw(struct vcpu *v, walk_t *gw) 
+{
+    mfn_t smfn;
+
+    if ( !(SHADOW2_AUDIT_ENABLE) )
+        return;
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    if ( valid_mfn(gw->l4mfn)
+         && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn, 
+                                                PGC_SH2_l4_shadow))) )
+        (void) sh2_audit_l4_table(v, smfn, _mfn(INVALID_MFN));
+#endif /* PAE or 64... */
+    if ( valid_mfn(gw->l3mfn)
+         && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn, 
+                                                PGC_SH2_l3_shadow))) )
+        (void) sh2_audit_l3_table(v, smfn, _mfn(INVALID_MFN));
+#endif /* All levels... */
+    if ( valid_mfn(gw->l2mfn) )
+    {
+        if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, 
+                                                 PGC_SH2_l2_shadow))) )
+            (void) sh2_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
+#if GUEST_PAGING_LEVELS == 3
+        /* PAE guests: also check the l2h shadow of the same l2 page. */
+        if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, 
+                                                 PGC_SH2_l2h_shadow))) )
+            (void) sh2_audit_l2_table(v, smfn, _mfn(INVALID_MFN));
+#endif
+    }
+    if ( valid_mfn(gw->l1mfn)
+         && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn, 
+                                                PGC_SH2_l1_shadow))) )
+        (void) sh2_audit_l1_table(v, smfn, _mfn(INVALID_MFN));
+    /* A PSE superpage has no l1 page of its own: audit its FL1 shadow,
+     * which is keyed by the superpage's starting gfn. */
+    else if ( gw->l2e
+              && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE)
+              && valid_mfn( 
+              (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) )
+        (void) sh2_audit_fl1_table(v, smfn, _mfn(INVALID_MFN));
+}
+
+#else
+/* Auditing compiled out: make the hook a no-op. */
+#define sh2_audit_gw(_v, _gw) do {} while(0)
+#endif /* audit code */
+
+
+
+/**************************************************************************/
+/* Function to write to the guest tables, for propagating accessed and 
+ * dirty bits from the shadow to the guest.
+ * Takes a guest mfn, a pointer to the guest entry, the level of pagetable,
+ * and an operation type.  The guest entry is always passed as an l1e: 
+ * since we only ever write flags, that's OK.
+ * Returns the new flag bits of the guest entry. */
+
+static u32 guest_set_ad_bits(struct vcpu *v,
+                             mfn_t gmfn, 
+                             guest_l1e_t *ep,
+                             unsigned int level, 
+                             fetch_type_t ft)
+{
+    u32 flags, shflags, bit;
+    struct page_info *pg;
+    int res = 0;
+
+    /* Caller must hold the shadow lock; gmfn must either be a known
+     * pagetable or have no outstanding type refs; ep must be naturally
+     * aligned; and this is only called for demand reads/writes. */
+    ASSERT(valid_mfn(gmfn)
+           && (sh2_mfn_is_a_page_table(gmfn)
+               || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) 
+                   == 0)));
+    ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1)));
+    ASSERT(level <= GUEST_PAGING_LEVELS);
+    ASSERT(ft == ft_demand_read || ft == ft_demand_write);
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+
+    flags = guest_l1e_get_flags(*ep);
+
+    /* PAE l3s do not have A and D bits */
+    if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) )
+        return flags;
+
+    /* Need the D bit as well for writes, in l1es and PSE l2es. */
+    if ( ft == ft_demand_write  
+         && (level == 1 || (level == 2 && (flags & _PAGE_PSE))) )
+    {
+        if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) 
+             == (_PAGE_DIRTY | _PAGE_ACCESSED) )
+            return flags;  /* Guest already has A and D bits set */
+        flags |= _PAGE_DIRTY | _PAGE_ACCESSED;
+        perfc_incrc(shadow2_ad_update);
+    }
+    else 
+    {
+        if ( flags & _PAGE_ACCESSED )
+            return flags;  /* Guest already has A bit set */
+        flags |= _PAGE_ACCESSED;
+        perfc_incrc(shadow2_a_update);
+    }
+
+    /* Set the bit(s) */
+    /* We are about to dirty the guest pagetable page by rewriting *ep
+     * in place, so record that in the log-dirty bitmap first. */
+    sh2_mark_dirty(v->domain, gmfn);
+    SHADOW2_DEBUG(A_AND_D, "gfn = %"SH2_PRI_gfn", "
+                  "old flags = %#x, new flags = %#x\n", 
+                  guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags);
+    *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags);
+    
+    /* May need to propagate this change forward to other kinds of shadow */
+    pg = mfn_to_page(gmfn);
+    if ( !sh2_mfn_is_a_page_table(gmfn) ) 
+    {
+        /* This guest pagetable is not yet shadowed at all. */
+        // MAF: I think this assert is busted...  If this gmfn has not yet
+        // been promoted, then it seems perfectly reasonable for there to be
+        // outstanding type refs to it...
+        /* TJD: No. If the gmfn has not been promoted, we must at least 
+         * have recognised that it is a pagetable, and pulled write access.
+         * The type count should only be non-zero if it is actually a page 
+         * table.  The test above was incorrect, though, so I've fixed it. */
+        ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0);
+        return flags;  
+    }
+
+    /* Revalidate the written entry in every kind of shadow of this page,
+     * except the shadow at the level we are currently fetching through
+     * (the caller handles that one). */
+    shflags = pg->shadow2_flags & SH2F_page_type_mask;
+    while ( shflags )
+    {
+        bit = find_first_set_bit(shflags);
+        ASSERT(shflags & (1u << bit));
+        shflags &= ~(1u << bit);
+        if ( !(pg->shadow2_flags & (1u << bit)) )
+            continue;
+        switch ( bit )
+        {
+        case PGC_SH2_type_to_index(PGC_SH2_l1_shadow):
+            if (level != 1) 
+                res |= sh2_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep));
+            break;
+        case PGC_SH2_type_to_index(PGC_SH2_l2_shadow):
+            if (level != 2) 
+                res |= sh2_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep));
+            break;
+#if GUEST_PAGING_LEVELS == 3 /* PAE only */
+        case PGC_SH2_type_to_index(PGC_SH2_l2h_shadow):
+            if (level != 2) 
+                res |= sh2_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep));
+            break;
+#endif
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+        case PGC_SH2_type_to_index(PGC_SH2_l3_shadow):
+            if (level != 3) 
+                res |= sh2_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep));
+            break;
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+        case PGC_SH2_type_to_index(PGC_SH2_l4_shadow):
+            if (level != 4) 
+                res |= sh2_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep));
+            break;
+#endif 
+#endif
+        default:
+            SHADOW2_ERROR("mfn %"SH2_PRI_mfn" is shadowed in multiple "
+                          "modes: A&D bits may be out of sync (flags=%#x).\n", 
+                          mfn_x(gmfn), pg->shadow2_flags); 
+            /* XXX Shadows in other modes will not be updated, so will
+             * have their A and D bits out of sync. */
+        }
+    }
+    
+    /* We should never need to flush the TLB or recopy PAE entries */
+    ASSERT( res == 0 || res == SHADOW2_SET_CHANGED );
+    return flags;
+}
+
+/**************************************************************************/
+/* Functions to compute the correct index into a shadow page, given an
+ * index into the guest page (as returned by guest_get_index()).
+ * This is trivial when the shadow and guest use the same sized PTEs, but
+ * gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
+ * PAE- or 64-bit shadows).
+ *
+ * These functions also increment the shadow mfn, when necessary.  When PTE
+ * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
+ * page.  In this case, we allocate 2 contiguous pages for the shadow L1, and
+ * use simple pointer arithmetic on a pointer to the guest L1e to figure out
+ * which shadow page we really want.  Similarly, when PTE sizes are
+ * mismatched, we shadow a guest L2 page with 4 shadow L2 pages.  (The easiest
+ * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
+ * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
+ * space.)
+ *
+ * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes
+ * of shadow (to store both the shadow, and the info that would normally be
+ * stored in page_info fields).  This arrangement allows the shadow and the
+ * "page_info" fields to always be stored in the same page (in fact, in
+ * the same cache line), avoiding an extra call to map_domain_page().
+ */
+
+static inline u32
+guest_index(void *ptr)
+{
+    /* Offset of ptr within its page, expressed in guest-PTE-sized units. */
+    unsigned long in_page_offset = (unsigned long)ptr & ~PAGE_MASK;
+    return (u32)(in_page_offset / sizeof(guest_l1e_t));
+}
+
+static inline u32
+shadow_l1_index(mfn_t *smfn, u32 guest_index)
+{
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+    /* A 32-bit guest l1 is shadowed by two consecutive shadow pages:
+     * step the mfn forward to the right page of the pair, then return
+     * the index within that page. */
+    u32 page = guest_index / SHADOW_L1_PAGETABLE_ENTRIES;
+    *smfn = _mfn(mfn_x(*smfn) + page);
+    return guest_index % SHADOW_L1_PAGETABLE_ENTRIES;
+#else
+    /* Guest and shadow PTEs are the same size: index passes through. */
+    return guest_index;
+#endif
+}
+
+static inline u32
+shadow_l2_index(mfn_t *smfn, u32 guest_index)
+{
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+    /* Each guest l2 entry is shadowed by a pair of shadow entries, so a
+     * shadow page holds only SHADOW_L2_PAGETABLE_ENTRIES/2 guest entries.
+     * Advance the mfn to the shadow page containing this entry. */
+    u32 entries_per_page = SHADOW_L2_PAGETABLE_ENTRIES / 2;
+    *smfn = _mfn(mfn_x(*smfn) + (guest_index / entries_per_page));
+
+    /* Scale by two: return the index of the first entry of the pair
+     * that shadows the specified guest entry. */
+    return (guest_index % entries_per_page) * 2;
+#else
+    return guest_index;
+#endif
+}
+
+#if GUEST_PAGING_LEVELS >= 3
+
+static inline u32
+shadow_l3_index(mfn_t *smfn, u32 guest_index)
+{
+#if GUEST_PAGING_LEVELS == 3
+    /* PAE l3 shadows use twice the space consumed by the guest l3, so a
+     * shadow page holds only SHADOW_L2_PAGETABLE_ENTRIES/2 guest entries.
+     * (Note this is *not* SHADOW_L3_PAGETABLE_ENTRIES, which is 4 here.)
+     * Advance the mfn to the shadow page containing this entry. */
+    *smfn = _mfn(mfn_x(*smfn) +
+                 (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2)));
+
+    /* PAE l3 shadows are stored in groups of 4, with shadows alternating
+     * with pae_l3_bookkeeping structs, so the effective index is
+     * group_id * 8 plus the slot within the group. */
+    guest_index %= (SHADOW_L2_PAGETABLE_ENTRIES / 2);
+    return ((guest_index / 4) * 8) + (guest_index & 3);
+#else
+    return guest_index;
+#endif
+}
+
+#endif // GUEST_PAGING_LEVELS >= 3
+
+#if GUEST_PAGING_LEVELS >= 4
+
+static inline u32
+shadow_l4_index(mfn_t *smfn, u32 guest_index)
+{
+    /* Only reached with 64-bit guests, whose l4es match the shadow's
+     * entry size, so no index translation or mfn adjustment is needed. */
+    return guest_index;
+}
+
+#endif // GUEST_PAGING_LEVELS >= 4
+
+
+/**************************************************************************/
+/* Functions which compute shadow entries from their corresponding guest
+ * entries.
+ *
+ * These are the "heart" of the shadow code.
+ *
+ * There are two sets of these: those that are called on demand faults (read
+ * faults and write faults), and those that are essentially called to
+ * "prefetch" (or propagate) entries from the guest into the shadow.  The read
+ * fault and write fault are handled as two separate cases for L1 entries (due
+ * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together
+ * into the respective demand_fault functions.
+ */
+
+#define CHECK(_cond)                                    \
+do {                                                    \
+    if (unlikely(!(_cond)))                             \
+    {                                                   \
+        printk("%s %s %d ASSERTION (%s) FAILED\n",      \
+               __func__, __FILE__, __LINE__, #_cond);   \
+        return -1;                                      \
+    }                                                   \
+} while (0);
+
+// The function below tries to capture all of the flag manipulation for the
+// demand and propagate functions into one place.
+//
+static always_inline u32
+sh2_propagate_flags(struct vcpu *v, mfn_t target_mfn, 
+                    u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn, 
+                    int mmio, int level, fetch_type_t ft)
+{
+    struct domain *d = v->domain;
+    u32 pass_thru_flags;
+    u32 sflags;
+    int lowest_level_guest_mapping;
+
+    // XXX -- might want to think about PAT support for HVM guests...
+
+#ifndef NDEBUG
+    // MMIO can only occur from L1e's
+    //
+    if ( mmio )
+        CHECK(level == 1);
+
+    // We should always have a pointer to the guest entry if it's a non-PSE
+    // non-MMIO demand access.
+    if ( ft & FETCH_TYPE_DEMAND )
+        CHECK(guest_entry_ptr || level == 1);
+#endif
+
+    // A not-present guest entry has a special signature in the shadow table,
+    // so that we do not have to consult the guest tables multiple times...
+    //
+    if ( unlikely(!(gflags & _PAGE_PRESENT)) )
+        return _PAGE_SHADOW_GUEST_NOT_PRESENT;
+
+    // Must have a valid target_mfn, unless this is mmio, or unless this is a
+    // prefetch.  In the case of a prefetch, an invalid mfn means that we can
+    // not usefully shadow anything, and so we return early.
+    //
+    if ( !valid_mfn(target_mfn) )
+    {
+        CHECK((ft == ft_prefetch) || mmio);
+        if ( !mmio )
+            return 0;
+    }
+
+    // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's...
+    //
+    if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) )
+        pass_thru_flags = _PAGE_PRESENT;
+    else
+    {
+        pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER |
+                           _PAGE_RW | _PAGE_PRESENT);
+        if ( guest_supports_nx(v) )
+            pass_thru_flags |= _PAGE_NX_BIT;
+    }
+
+    // PAE guests can not put NX, RW, USER, ACCESSED, or DIRTY bits into their
+    // L3e's; they are all implied.  So we emulate them here.
+    //
+    if ( (GUEST_PAGING_LEVELS == 3) && (level == 3) )
+        gflags = pass_thru_flags;
+
+    // Propagate bits from the guest to the shadow.
+    // Some of these may be overwritten, below.
+    // Since we know the guest's PRESENT bit is set, we also set the shadow's
+    // SHADOW_PRESENT bit.
+    //
+    sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT;
+
+    // Copy the guest's RW bit into the SHADOW_RW bit.
+    //
+    if ( gflags & _PAGE_RW )
+        sflags |= _PAGE_SHADOW_RW;
+
+    // Set the A&D bits for higher level shadows.
+    // Higher level entries do not, strictly speaking, have dirty bits, but
+    // since we use shadow linear tables, each of these entries may, at some
+    // point in time, also serve as a shadow L1 entry.
+    // By setting both the  A&D bits in each of these, we eliminate the burden
+    // on the hardware to update these bits on initial accesses.
+    //
+    if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
+        sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
+
+    // True iff this entry maps a data frame directly: an l1e, or a PSE
+    // l2e when the guest has superpages enabled.
+    //
+    lowest_level_guest_mapping =
+        ((level == 1) ||
+         ((level == 2) && guest_supports_superpages(v) &&
+          (gflags & _PAGE_PSE)));
+
+    // Set the A and D bits in the guest entry, if we need to.
+    if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) )
+        gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft);
+    
+    // If the A or D bit has not yet been set in the guest, then we must
+    // prevent the corresponding kind of access.
+    //
+    if ( unlikely(!((GUEST_PAGING_LEVELS == 3) && (level == 3)) &&
+                  !(gflags & _PAGE_ACCESSED)) )
+        sflags &= ~_PAGE_PRESENT;
+
+    if ( unlikely(lowest_level_guest_mapping &&
+                  !(gflags & _PAGE_DIRTY)) )
+        sflags &= ~_PAGE_RW;
+
+    // MMIO caching
+    //
+    // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit
+    // to cache the fact that this entry  is in MMIO space.
+    //
+    if ( (level == 1) && mmio )
+    {
+        sflags &= ~(_PAGE_PRESENT);
+        sflags |= _PAGE_SHADOW_MMIO;
+    }
+    else 
+    {
+        // shadow2_mode_log_dirty support
+        //
+        // Only allow the guest write access to a page a) on a demand fault,
+        // or b) if the page is already marked as dirty.
+        //
+        if ( unlikely((level == 1) &&
+                      !(ft & FETCH_TYPE_WRITE) &&
+                      shadow2_mode_log_dirty(d) &&
+                      !sh2_mfn_is_dirty(d, target_mfn)) )
+        {
+            sflags &= ~_PAGE_RW;
+        }
+        
+        // protect guest page tables
+        //
+        if ( unlikely((level == 1) &&
+                      sh2_mfn_is_a_page_table(target_mfn)) )
+        {
+            if ( shadow2_mode_trap_reads(d) )
+            {
+                // if we are trapping both reads & writes, then mark this page
+                // as not present...
+                //
+                sflags &= ~_PAGE_PRESENT;
+            }
+            else
+            {
+                // otherwise, just prevent any writes...
+                //
+                sflags &= ~_PAGE_RW;
+            }
+        }
+    }
+
+    return sflags;
+}
+
+#undef CHECK
+
+#if GUEST_PAGING_LEVELS >= 4
+static void
+l4e_propagate_from_guest(struct vcpu *v, 
+                         guest_l4e_t *gl4e,
+                         mfn_t gl4mfn,
+                         mfn_t sl3mfn,
+                         shadow_l4e_t *sl4p,
+                         fetch_type_t ft)
+{
+    /* Derive the shadow l4e for *gl4e and write it to *sl4p, pointing
+     * it at the l3 shadow sl3mfn. */
+    u32 guest_flags = guest_l4e_get_flags(*gl4e);
+    u32 shadow_flags = sh2_propagate_flags(v, sl3mfn, guest_flags,
+                                           (guest_l1e_t *) gl4e, gl4mfn,
+                                           0, 4, ft);
+
+    *sl4p = shadow_l4e_from_mfn(sl3mfn, shadow_flags);
+
+    SHADOW2_DEBUG(PROPAGATE,
+                  "%s gl4e=%" SH2_PRI_gpte " sl4e=%" SH2_PRI_pte "\n",
+                  fetch_type_names[ft], gl4e->l4, sl4p->l4);
+    ASSERT(shadow_flags != -1);
+}
+#endif // GUEST_PAGING_LEVELS >= 4
+
+#if GUEST_PAGING_LEVELS >= 3
+static void
+l3e_propagate_from_guest(struct vcpu *v,
+                         guest_l3e_t *gl3e,
+                         mfn_t gl3mfn, 
+                         mfn_t sl2mfn, 
+                         shadow_l3e_t *sl3p,
+                         fetch_type_t ft)
+{
+    /* Derive the shadow l3e for *gl3e and write it to *sl3p, pointing
+     * it at the l2 shadow sl2mfn. */
+    u32 guest_flags = guest_l3e_get_flags(*gl3e);
+    u32 shadow_flags = sh2_propagate_flags(v, sl2mfn, guest_flags,
+                                           (guest_l1e_t *) gl3e, gl3mfn,
+                                           0, 3, ft);
+
+    *sl3p = shadow_l3e_from_mfn(sl2mfn, shadow_flags);
+
+    SHADOW2_DEBUG(PROPAGATE,
+                  "%s gl3e=%" SH2_PRI_gpte " sl3e=%" SH2_PRI_pte "\n",
+                  fetch_type_names[ft], gl3e->l3, sl3p->l3);
+    ASSERT(shadow_flags != -1);
+}
+#endif // GUEST_PAGING_LEVELS >= 3
+
+static void
+l2e_propagate_from_guest(struct vcpu *v, 
+                         guest_l2e_t *gl2e,
+                         mfn_t gl2mfn,
+                         mfn_t sl1mfn, 
+                         shadow_l2e_t *sl2p,
+                         fetch_type_t ft)
+{
+    /* Derive the shadow l2e for *gl2e and write it to *sl2p, pointing
+     * it at the l1 shadow sl1mfn. */
+    u32 guest_flags = guest_l2e_get_flags(*gl2e);
+    u32 shadow_flags = sh2_propagate_flags(v, sl1mfn, guest_flags,
+                                           (guest_l1e_t *) gl2e, gl2mfn,
+                                           0, 2, ft);
+
+    *sl2p = shadow_l2e_from_mfn(sl1mfn, shadow_flags);
+
+    SHADOW2_DEBUG(PROPAGATE,
+                  "%s gl2e=%" SH2_PRI_gpte " sl2e=%" SH2_PRI_pte "\n",
+                  fetch_type_names[ft], gl2e->l2, sl2p->l2);
+    ASSERT(shadow_flags != -1);
+}
+
+static inline int
+l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
+               int mmio)
+/* returns 1 if emulation is required, and 0 otherwise */
+{
+    struct domain *d = v->domain;
+    u32 guest_flags = guest_l1e_get_flags(gw->eff_l1e);
+    /* Compute the shadow flags first: this may also update the guest's
+     * A/D bits for a demand access. */
+    u32 shadow_flags = sh2_propagate_flags(v, gmfn, guest_flags,
+                                           gw->l1e, gw->l1mfn,
+                                           mmio, 1, ft_demand_read);
+
+    if ( shadow2_mode_trap_reads(d) && !mmio && sh2_mfn_is_a_page_table(gmfn) )
+    {
+        /* Reads of guest pagetables must be emulated: install nothing. */
+        *sl1p = shadow_l1e_empty();
+        return 1;
+    }
+
+    *sl1p = shadow_l1e_from_mfn(gmfn, shadow_flags);
+
+    SHADOW2_DEBUG(PROPAGATE,
+                  "va=%p eff_gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n",
+                  (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
+
+    ASSERT(shadow_flags != -1);
+    return 0;
+}
+
+static inline int
+l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p,
+                int mmio)
+/* returns 1 if emulation is required, and 0 otherwise */
+{
+    struct domain *d = v->domain;
+    u32 guest_flags = guest_l1e_get_flags(gw->eff_l1e);
+    /* Compute the shadow flags first: this may also update the guest's
+     * A/D bits for a demand access. */
+    u32 shadow_flags = sh2_propagate_flags(v, gmfn, guest_flags,
+                                           gw->l1e, gw->l1mfn,
+                                           mmio, 1, ft_demand_write);
+
+    sh2_mark_dirty(d, gmfn);
+
+    if ( !mmio && sh2_mfn_is_a_page_table(gmfn) )
+    {
+        /* Writes to guest pagetables must be emulated: install nothing. */
+        *sl1p = shadow_l1e_empty();
+        return 1;
+    }
+
+    *sl1p = shadow_l1e_from_mfn(gmfn, shadow_flags);
+
+    SHADOW2_DEBUG(PROPAGATE,
+                  "va=%p eff_gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n",
+                  (void *)gw->va, gw->eff_l1e.l1, sl1p->l1);
+
+    ASSERT(shadow_flags != -1);
+    return 0;
+}
+
+static inline void
+l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p,
+                         int mmio)
+/* Prefetch propagation of gl1e into *sl1p.  For MMIO the guest frame
+ * number is passed through as the "mfn" directly. */
+{
+    gfn_t gfn = guest_l1e_get_gfn(gl1e);
+    mfn_t gmfn = (mmio) ? _mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn);
+    u32 gflags = guest_l1e_get_flags(gl1e);
+    /* Pass NULL (not the integer 0) for the guest-entry pointer: with no
+     * pointer, sh2_propagate_flags cannot write A/D bits back, which is
+     * correct for a prefetch. */
+    u32 sflags = sh2_propagate_flags(v, gmfn, gflags, NULL, _mfn(INVALID_MFN), 
+                                     mmio, 1, ft_prefetch);
+
+    *sl1p = shadow_l1e_from_mfn(gmfn, sflags);
+
+    SHADOW2_DEBUG(PROPAGATE,
+                  "gl1e=%" SH2_PRI_gpte " sl1e=%" SH2_PRI_pte "\n",
+                  gl1e.l1, sl1p->l1);
+
+    ASSERT(sflags != -1);
+}
+
+
+/**************************************************************************/
+/* These functions update shadow entries (and do bookkeeping on the shadow
+ * tables they are in).  It is intended that they are the only
+ * functions which ever write (non-zero) data onto a shadow page.
+ *
+ * They return a set of flags: 
+ * SHADOW2_SET_CHANGED -- we actually wrote a new value to the shadow.
+ * SHADOW2_SET_FLUSH   -- the caller must cause a TLB flush.
+ * SHADOW2_SET_ERROR   -- the input is not a valid entry (for example, if
+ *                        shadow2_get_page_from_l1e() fails).
+ * SHADOW2_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local
+ *                             copies of their PAE L3 entries re-copied.
+ */
+
+static inline void safe_write_entry(void *dst, void *src) 
+/* Copy one PTE safely when processors might be running on the
+ * destination pagetable.   This does *not* give safety against
+ * concurrent writes (that's what the shadow lock is for), just 
+ * stops the hardware picking up partially written entries. */
+{
+    volatile unsigned long *d = dst;
+    unsigned long *s = src;
+    /* The entry must be naturally aligned so that each word-sized store
+     * below is individually atomic. */
+    ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
+#if CONFIG_PAGING_LEVELS == 3
+    /* In PAE mode, pagetable entries are larger
+     * than machine words, so won't get written atomically.  We need to make
+     * sure any other cpu running on these shadows doesn't see a
+     * half-written entry.  Do this by marking the entry not-present first,
+     * then writing the high word before the low word. */
+    BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long));
+    d[0] = 0;
+    d[1] = s[1];
+    d[0] = s[0];
+#else
+    /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
+     * which will be an atomic write, since the entry is aligned. */
+    BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
+    *d = *s;
+#endif
+}
+
+
+static inline void 
+shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
+/* This function does the actual writes to shadow pages.
+ * It must not be called directly, since it doesn't do the bookkeeping
+ * that shadow_set_l*e() functions do. */
+{
+    shadow_l1e_t *dst = d;
+    shadow_l1e_t *src = s;
+    void *map = NULL;
+    int i;
+
+    /* Because we mirror access rights at all levels in the shadow, an
+     * l2 (or higher) entry with the RW bit cleared will leave us with
+     * no write access through the linear map.  
+     * We detect that by writing to the shadow with copy_to_user() and 
+     * using map_domain_page() to get a writeable mapping if we need to. */
+    /* Probe for writability by rewriting the first word of d with its
+     * own value; a fault here means the linear map is read-only. */
+    if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 ) 
+    {
+        perfc_incrc(shadow2_linear_map_failed);
+        map = sh2_map_domain_page(mfn);
+        ASSERT(map != NULL);
+        dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
+    }
+
+
+    for ( i = 0; i < entries; i++ )
+        safe_write_entry(dst++, src++);
+
+    if ( map != NULL ) sh2_unmap_domain_page(map);
+
+    /* XXX TODO:
+     * Update min/max field in page_info struct of this mfn */
+}
+
+static inline int
+perms_strictly_increased(u32 old_flags, u32 new_flags) 
+/* Given the flags of two entries, are the new flags a strict
+ * increase in rights over the old ones? */
+{
+    const u32 rights_mask = _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX;
+    /* Flip NX so it reads as an "execute-allowed" bit: then every set
+     * bit in the masked value grants a right. */
+    u32 before = (old_flags & rights_mask) ^ _PAGE_NX_BIT;
+    u32 after  = (new_flags & rights_mask) ^ _PAGE_NX_BIT;
+    /* Rights strictly increased iff every bit that changed is set in
+     * the new value. */
+    return ((before | (before ^ after)) == after);
+}
+
+static inline int
+shadow2_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
+/* Take a reference to the frame mapped by sl1e on behalf of domain d.
+ * Returns non-zero on success, 0 on failure. */
+{
+    int res;
+    mfn_t mfn;
+    struct domain *owner;
+    /* Strip the shadow2-private flag bits before handing the entry to
+     * the generic refcounting code. */
+    shadow_l1e_t sanitized_sl1e =
+        shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT);
+
+    //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT);
+    //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0);
+
+    /* Non-refcounting modes take no references at all. */
+    if ( !shadow2_mode_refcounts(d) )
+        return 1;
+
+    res = get_page_from_l1e(sanitized_sl1e, d);
+
+    // If a privileged domain is attempting to install a map of a page it does
+    // not own, we let it succeed anyway.
+    //
+    if ( unlikely(!res) &&
+         IS_PRIV(d) &&
+         !shadow2_mode_translate(d) &&
+         valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) &&
+         (owner = page_get_owner(mfn_to_page(mfn))) &&
+         (d != owner) )
+    {
+        res = get_page_from_l1e(sanitized_sl1e, owner);
+        SHADOW2_PRINTK("privileged domain %d installs map of mfn %05lx "
+                       "which is owned by domain %d: %s\n",
+                       d->domain_id, mfn_x(mfn), owner->domain_id,
+                       res ? "success" : "failed");
+    }
+
+    if ( unlikely(!res) )
+    {
+        perfc_incrc(shadow2_get_page_fail);
+        /* Was missing both the '%' and the argument for SH2_PRI_pte. */
+        SHADOW2_PRINTK("failed: l1e=%" SH2_PRI_pte "\n", sl1e.l1);
+    }
+
+    return res;
+}
+
+static void inline
+shadow2_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
+/* Drop the reference taken by shadow2_get_page_from_l1e(), if any. */
+{ 
+    /* Only refcounting modes ever took a reference in the first place. */
+    if ( shadow2_mode_refcounts(d) )
+        put_page_from_l1e(sl1e, d);
+}
+
+#if GUEST_PAGING_LEVELS >= 4
+static int shadow_set_l4e(struct vcpu *v, 
+                          shadow_l4e_t *sl4e, 
+                          shadow_l4e_t new_sl4e, 
+                          mfn_t sl4mfn)
+/* Install new_sl4e at *sl4e (an entry in shadow l4 page sl4mfn),
+ * adjusting refcounts on the l3 shadows pointed to.  Returns a set of
+ * SHADOW2_SET_* flags. */
+{
+    int flags = 0;
+    shadow_l4e_t old_sl4e;
+    paddr_t paddr;
+    ASSERT(sl4e != NULL);
+    old_sl4e = *sl4e;
+
+    if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
+    
+    /* Physical address of this entry; passed to the refcounting code to
+     * identify where the reference is held. */
+    paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) 
+             | (((unsigned long)sl4e) & ~PAGE_MASK));
+
+    if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT ) 
+    {
+        /* About to install a new reference */        
+        sh2_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr);
+    } 
+
+    /* Write the new entry */
+    shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
+    flags |= SHADOW2_SET_CHANGED;
+
+    if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT ) 
+    {
+        /* We lost a reference to an old mfn. */
+        mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
+        /* Unless the new entry targets the same mfn with strictly greater
+         * rights, stale TLB entries may remain and must be flushed. */
+        if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
+             || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e), 
+                                          shadow_l4e_get_flags(new_sl4e)) )
+        {
+            flags |= SHADOW2_SET_FLUSH;
+        }
+        sh2_put_ref(v, osl3mfn, paddr);
+    }
+    return flags;
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+#if GUEST_PAGING_LEVELS >= 3
+static int shadow_set_l3e(struct vcpu *v, 
+                          shadow_l3e_t *sl3e, 
+                          shadow_l3e_t new_sl3e, 
+                          mfn_t sl3mfn)
+/* Install new_sl3e at *sl3e (an entry in shadow l3 page sl3mfn),
+ * adjusting refcounts on the l2 shadows pointed to.  Returns a set of
+ * SHADOW2_SET_* flags. */
+{
+    int flags = 0;
+    shadow_l3e_t old_sl3e;
+    paddr_t paddr;
+    ASSERT(sl3e != NULL);
+    old_sl3e = *sl3e;
+
+    if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
+
+    /* Physical address of this entry; passed to the refcounting code to
+     * identify where the reference is held. */
+    paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) 
+             | (((unsigned long)sl3e) & ~PAGE_MASK));
+    
+    if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT ) 
+    {
+        /* About to install a new reference */        
+        sh2_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr);
+    } 
+
+    /* Write the new entry */
+    shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
+    flags |= SHADOW2_SET_CHANGED;
+
+#if GUEST_PAGING_LEVELS == 3 
+    /* We wrote a guest l3e in a PAE pagetable.  This table is copied in
+     * the linear pagetable entries of its l2s, and may also be copied
+     * to a low memory location to make it fit in CR3.  Report that we
+     * need to resync those copies (we can't wait for the guest to flush
+     * the TLB because it might be an increase in rights). */
+    {
+        struct vcpu *vcpu;
+
+        struct pae_l3_bookkeeping *info = sl3p_to_info(sl3e);
+        for_each_vcpu(v->domain, vcpu)
+        {
+            if (info->vcpus & (1 << vcpu->vcpu_id))
+            {
+                // Remember that this flip/update needs to occur.
+                vcpu->arch.shadow2_pae_flip_pending = 1;
+                flags |= SHADOW2_SET_L3PAE_RECOPY;
+            }
+        }
+    }
+#endif
+
+    if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT ) 
+    {
+        /* We lost a reference to an old mfn. */
+        mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e);
+        /* Unless the new entry targets the same mfn with strictly greater
+         * rights, stale TLB entries may remain and must be flushed. */
+        if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) ||
+             !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e), 
+                                       shadow_l3e_get_flags(new_sl3e)) ) 
+        {
+            flags |= SHADOW2_SET_FLUSH;
+        }
+        sh2_put_ref(v, osl2mfn, paddr);
+    }
+    return flags;
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */ 
+
+static int shadow_set_l2e(struct vcpu *v, 
+                          shadow_l2e_t *sl2e, 
+                          shadow_l2e_t new_sl2e, 
+                          mfn_t sl2mfn)
+/* Install new_sl2e at *sl2e (an entry in shadow l2 page sl2mfn),
+ * adjusting refcounts on the l1 shadows pointed to.  Returns a set of
+ * SHADOW2_SET_* flags. */
+{
+    int flags = 0;
+    shadow_l2e_t old_sl2e;
+    paddr_t paddr;
+
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+    /* In 2-on-3 we work with pairs of l2es pointing at two-page
+     * shadows.  Reference counting and up-pointers track from the first
+     * page of the shadow to the first l2e, so make sure that we're 
+     * working with those:     
+     * Align the pointer down so it's pointing at the first of the pair */
+    sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t)));
+    /* Align the mfn of the shadow entry too */
+    new_sl2e.l2 &= ~(1<<PAGE_SHIFT);
+#endif
+
+    ASSERT(sl2e != NULL);
+    old_sl2e = *sl2e;
+    
+    if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */
+    
+    /* Physical address of this entry; passed to the refcounting code to
+     * identify where the reference is held. */
+    paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT)
+             | (((unsigned long)sl2e) & ~PAGE_MASK));
+
+    if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) 
+    {
+        /* About to install a new reference */
+        sh2_get_ref(shadow_l2e_get_mfn(new_sl2e), paddr);
+    } 
+
+    /* Write the new entry */
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+    {
+        shadow_l2e_t pair[2] = { new_sl2e, new_sl2e };
+        /* The l1 shadow is two pages long and need to be pointed to by
+         * two adjacent l1es.  The pair have the same flags, but point
+         * at odd and even MFNs */
+        ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT)));
+        pair[1].l2 |= (1<<PAGE_SHIFT);
+        shadow_write_entries(sl2e, &pair, 2, sl2mfn);
+    }
+#else /* normal case */
+    shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn);
+#endif
+    flags |= SHADOW2_SET_CHANGED;
+
+    if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT ) 
+    {
+        /* We lost a reference to an old mfn. */
+        mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e);
+        /* Unless the new entry targets the same mfn with strictly greater
+         * rights, stale TLB entries may remain and must be flushed. */
+        if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) ||
+             !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e), 
+                                       shadow_l2e_get_flags(new_sl2e)) ) 
+        {
+            flags |= SHADOW2_SET_FLUSH;
+        }
+        sh2_put_ref(v, osl1mfn, paddr);
+    }
+    return flags;
+}
+
+static int shadow_set_l1e(struct vcpu *v, 
+                          shadow_l1e_t *sl1e, 
+                          shadow_l1e_t new_sl1e,
+                          mfn_t sl1mfn)
+/* Install new_sl1e at *sl1e (an entry in shadow l1 page sl1mfn), taking
+ * and dropping page references as required in refcounting modes.
+ * Returns a set of SHADOW2_SET_* flags. */
+{
+    int flags = 0;
+    struct domain *d = v->domain;
+    shadow_l1e_t old_sl1e;
+    ASSERT(sl1e != NULL);
+    
+    old_sl1e = *sl1e;
+
+    if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */
+    
+    if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT ) 
+    {
+        /* About to install a new reference */        
+        if ( shadow2_mode_refcounts(d) ) {
+            if ( shadow2_get_page_from_l1e(new_sl1e, d) == 0 ) 
+            {
+                /* Doesn't look like a pagetable. */
+                /* Install an empty entry instead and report the error. */
+                flags |= SHADOW2_SET_ERROR;
+                new_sl1e = shadow_l1e_empty();
+            }
+        }
+    } 
+
+    /* Write the new entry */
+    shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn);
+    flags |= SHADOW2_SET_CHANGED;
+
+    if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT ) 
+    {
+        /* We lost a reference to an old mfn. */
+        /* N.B. Unlike higher-level sets, never need an extra flush 
+         * when writing an l1e.  Because it points to the same guest frame 
+         * as the guest l1e did, it's the guest's responsibility to
+         * trigger a flush later. */
+        if ( shadow2_mode_refcounts(d) ) 
+        {
+            shadow2_put_page_from_l1e(old_sl1e, d);
+        } 
+    }
+    return flags;
+}
+
+
+/**************************************************************************/
+/* These functions take a vcpu and a virtual address, and return a pointer
+ * to the appropriate level N entry from the shadow tables.  
+ * If the necessary tables are not present in the shadow, they return NULL. */
+
+/* N.B. The use of GUEST_PAGING_LEVELS here is correct.  If the shadow has
+ * more levels than the guest, the upper levels are always fixed and do not 
+ * reflect any information from the guest, so we do not use these functions 
+ * to access them. */
+
+#if GUEST_PAGING_LEVELS >= 4
+static shadow_l4e_t *
+shadow_get_l4e(struct vcpu *v, unsigned long va)
+{
+    /* The shadow l4 is the top level, reached through the shadow linear
+     * map; reading it is always valid, so no presence checks needed. */
+    shadow_l4e_t *sl4_table = sh2_linear_l4_table(v);
+    return sl4_table + shadow_l4_linear_offset(va);
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#if GUEST_PAGING_LEVELS >= 3
+static shadow_l3e_t *
+shadow_get_l3e(struct vcpu *v, unsigned long va)
+/* Return a pointer to the shadow l3e for va, or NULL if it is not
+ * reachable (64-bit: the shadow l4e is not present). */
+{
+#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
+    /* Get the l4 */
+    shadow_l4e_t *sl4e = shadow_get_l4e(v, va);
+    ASSERT(sl4e != NULL);
+    if ( !(shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT) )
+        return NULL;
+    ASSERT(valid_mfn(shadow_l4e_get_mfn(*sl4e)));
+    /* l4 was present; OK to get the l3 */
+    return sh2_linear_l3_table(v) + shadow_l3_linear_offset(va);
+#else /* PAE... */
+    /* Top level is always mapped */
+    ASSERT(v->arch.shadow_vtable);
+    return ((shadow_l3e_t *)v->arch.shadow_vtable) + shadow_l3_linear_offset(va);
+#endif 
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+
+/* Return a pointer to the shadow l2e that maps va, or NULL if any
+ * higher-level shadow entry on the path is missing.  For 2-level
+ * guests there is no l3 walk to fail, so the result is always valid. */
+static shadow_l2e_t *
+shadow_get_l2e(struct vcpu *v, unsigned long va)
+{
+#if GUEST_PAGING_LEVELS >= 3  /* 64bit/PAE... */
+    /* Get the l3 */
+    shadow_l3e_t *sl3e = shadow_get_l3e(v, va);
+    if ( sl3e == NULL || !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) )
+        return NULL;
+    ASSERT(valid_mfn(shadow_l3e_get_mfn(*sl3e)));
+    /* l3 was present; OK to get the l2 */
+#endif
+    return sh2_linear_l2_table(v) + shadow_l2_linear_offset(va);
+}
+
+
+#if 0 // avoid the compiler warning for now...
+
+/* Return a pointer to the shadow l1e that maps va, or NULL if the l2e
+ * above it is missing or not present.
+ * NOTE(review): compiled out, presumably to silence an unused-function
+ * warning until a caller is added -- see the #if 0 comment above. */
+static shadow_l1e_t *
+shadow_get_l1e(struct vcpu *v, unsigned long va)
+{
+    /* Get the l2 */
+    shadow_l2e_t *sl2e = shadow_get_l2e(v, va);
+    if ( sl2e == NULL || !(shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT) )
+        return NULL;
+    ASSERT(valid_mfn(shadow_l2e_get_mfn(*sl2e)));
+    /* l2 was present; OK to get the l1 */
+    return sh2_linear_l1_table(v) + shadow_l1_linear_offset(va);
+}
+
+#endif
+
+
+/**************************************************************************/
+/* Macros to walk pagetables.  These take the shadow of a pagetable and 
+ * walk every "interesting" entry.  That is, they don't touch Xen mappings, 
+ * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every 
+ * second entry (since pairs of entries are managed together). For multi-page
+ * shadows they walk all pages.
+ * 
+ * Arguments are an MFN, the variable to point to each entry, a variable 
+ * to indicate that we are done (we will shortcut to the end of the scan 
+ * when _done != 0), a variable to indicate that we should avoid Xen mappings,
+ * and the code. 
+ *
+ * WARNING: These macros have side-effects.  They change the values of both 
+ * the pointer and the MFN. */ 
+
+/* If ptr is non-NULL it points at a cursor (a guest_l1e_t *): advance
+ * that cursor by one entry.  A NULL ptr means the caller is walking a
+ * shadow with no corresponding guest table, and is simply ignored. */
+static inline void increment_ptr_to_guest_entry(void *ptr)
+{
+    guest_l1e_t **cursor = ptr;
+
+    if ( cursor == NULL )
+        return;
+    (*cursor)++;
+}
+
+/* All kinds of l1: touch all entries */
+/* Accepts both ordinary l1 shadows and fl1 (splintered-superpage)
+ * shadows.  _code runs only for present entries; _done is re-evaluated
+ * after each entry, and _gl1p (if non-NULL) is advanced in step. */
+#define _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)       \
+do {                                                                    \
+    int _i;                                                             \
+    shadow_l1e_t *_sp = map_shadow_page((_sl1mfn));                     \
+    ASSERT((mfn_to_page(_sl1mfn)->count_info & PGC_SH2_type_mask)       \
+           == PGC_SH2_l1_shadow                                         \
+           || (mfn_to_page(_sl1mfn)->count_info & PGC_SH2_type_mask)    \
+           == PGC_SH2_fl1_shadow);                                      \
+    for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ )              \
+    {                                                                   \
+        (_sl1e) = _sp + _i;                                             \
+        if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT )           \
+            {_code}                                                     \
+        if ( _done ) break;                                             \
+        increment_ptr_to_guest_entry(_gl1p);                            \
+    }                                                                   \
+    unmap_shadow_page(_sp);                                             \
+} while (0)
+
+/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */
+/* The __done latch captures _done from the first page's walk so that a
+ * termination request there also skips the second page.  Note the walk
+ * mutates _sl1mfn to step to the second page (see WARNING above). */
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+#define SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done,  _code)       \
+do {                                                                    \
+    int __done = 0;                                                     \
+    _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p,                         \
+                         ({ (__done = _done); }), _code);               \
+    _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1);                                 \
+    if ( !__done )                                                      \
+        _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p,                     \
+                             ({ (__done = _done); }), _code);           \
+} while (0)
+#else /* Everything else; l1 shadows are only one page */
+#define SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)        \
+       _SHADOW2_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code)
+#endif
+    
+
+#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2
+
+/* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */
+/* Entries come in managed pairs, hence the _i += 2 stride; the _xen
+ * test compares the flat entry index (_j pages in) against the first
+ * l2 slot covering HYPERVISOR_VIRT_START.  _sl2mfn is mutated to step
+ * through the four shadow pages. */
+#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)    \
+do {                                                                      \
+    int _i, _j, __done = 0;                                               \
+    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask)         \
+           == PGC_SH2_l2_32_shadow);                                      \
+    for ( _j = 0; _j < 4 && !__done; _j++ )                               \
+    {                                                                     \
+        shadow_l2e_t *_sp = map_shadow_page(_sl2mfn);                     \
+        for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 )         \
+            if ( (!(_xen))                                                \
+                 || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i)             \
+                 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \
+            {                                                             \
+                (_sl2e) = _sp + _i;                                       \
+                if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )     \
+                    {_code}                                               \
+                if ( (__done = (_done)) ) break;                          \
+                increment_ptr_to_guest_entry(_gl2p);                      \
+            }                                                             \
+        unmap_shadow_page(_sp);                                           \
+        _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1);                               \
+    }                                                                     \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 2
+
+/* 32-bit on 32-bit: avoid Xen entries */
+/* Single-page shadow; skip slots at or above HYPERVISOR_VIRT_START
+ * when _xen is set. */
+#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)     \
+do {                                                                       \
+    int _i;                                                                \
+    shadow_l2e_t *_sp = map_shadow_page((_sl2mfn));                        \
+    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask)          \
+           == PGC_SH2_l2_32_shadow);                                       \
+    for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )                 \
+        if ( (!(_xen))                                                     \
+             ||                                                            \
+             (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
+        {                                                                  \
+            (_sl2e) = _sp + _i;                                            \
+            if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )          \
+                {_code}                                                    \
+            if ( _done ) break;                                            \
+            increment_ptr_to_guest_entry(_gl2p);                           \
+        }                                                                  \
+    unmap_shadow_page(_sp);                                                \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 3
+
+/* PAE: if it's an l2h, don't touch Xen mappings */
+/* Only the l2h (the l2 that backs l3 slot 3) contains Xen mappings, so
+ * the Xen test is offset by 3 * SHADOW_L2_PAGETABLE_ENTRIES to index
+ * into the full 4GB of l2 slots. */
+#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)     \
+do {                                                                       \
+    int _i;                                                                \
+    shadow_l2e_t *_sp = map_shadow_page((_sl2mfn));                        \
+    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask)          \
+           == PGC_SH2_l2_pae_shadow                                        \
+           || (mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask)       \
+           == PGC_SH2_l2h_pae_shadow);                                     \
+    for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )                 \
+        if ( (!(_xen))                                                     \
+             || ((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask)    \
+                 != PGC_SH2_l2h_pae_shadow)                                \
+             || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES))                  \
+                 < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \
+        {                                                                  \
+            (_sl2e) = _sp + _i;                                            \
+            if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )          \
+                {_code}                                                    \
+            if ( _done ) break;                                            \
+            increment_ptr_to_guest_entry(_gl2p);                           \
+        }                                                                  \
+    unmap_shadow_page(_sp);                                                \
+} while (0)
+
+#else 
+
+/* 64-bit l2: touch all entries */
+/* No Xen mappings live in 64-bit l2s, so _xen is accepted but unused. */
+#define SHADOW2_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code)  \
+do {                                                                    \
+    int _i;                                                             \
+    shadow_l2e_t *_sp = map_shadow_page((_sl2mfn));                     \
+    ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH2_type_mask)       \
+           == PGC_SH2_l2_64_shadow);                                    \
+    for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ )              \
+    {                                                                   \
+        (_sl2e) = _sp + _i;                                             \
+        if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT )           \
+            {_code}                                                     \
+        if ( _done ) break;                                             \
+        increment_ptr_to_guest_entry(_gl2p);                            \
+    }                                                                   \
+    unmap_shadow_page(_sp);                                             \
+} while (0)
+
+#endif /* different kinds of l2 */
+
+#if GUEST_PAGING_LEVELS == 3
+
+/* PAE l3 subshadow: touch all entries (FOREACH_L2E will find Xen l2es). */
+/* Walks the 4 l3es of a single subshadow; _sl3e itself is advanced. */
+#define SHADOW2_FOREACH_L3E_SUB(_sl3e, _gl3p, _done, _code)             \
+do {                                                                    \
+    int _i;                                                             \
+    for ( _i = 0; _i < 4; _i++ )                                        \
+    {                                                                   \
+        if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT )           \
+            {_code}                                                     \
+        if ( _done ) break;                                             \
+        _sl3e++;                                                        \
+        increment_ptr_to_guest_entry(_gl3p);                            \
+    }                                                                   \
+} while (0)
+
+/* PAE l3 full shadow: call subshadow walk on all valid l3 subshadows */
+/* Subshadows with refcount == 0 are skipped, but _gl3p is still
+ * advanced past their 4 guest entries to keep the cursors in step.
+ * _sl3mfn is mutated to reach the second page of the shadow. */
+#define SHADOW2_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code)        \
+do {                                                                    \
+    int _i, _j, _k, __done = 0;                                         \
+    ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH2_type_mask)       \
+           == PGC_SH2_l3_pae_shadow);                                   \
+    /* The subshadows are split, 64 on each page of the shadow */       \
+    for ( _j = 0; _j < 2 && !__done; _j++ )                             \
+    {                                                                   \
+        void *_sp = sh2_map_domain_page(_sl3mfn);                       \
+        for ( _i = 0; _i < 64; _i++ )                                   \
+        {                                                               \
+            /* Every second 32-byte region is a bookkeeping entry */    \
+            _sl3e = (shadow_l3e_t *)(_sp + (64 * _i));                  \
+            if ( (sl3p_to_info(_sl3e))->refcount > 0 )                  \
+                SHADOW2_FOREACH_L3E_SUB(_sl3e, _gl3p,                   \
+                                        ({ __done = (_done); __done; }), \
+                                        _code);                         \
+            else                                                        \
+                for ( _k = 0 ; _k < 4 ; _k++ )                          \
+                    increment_ptr_to_guest_entry(_gl3p);                \
+            if ( __done ) break;                                        \
+        }                                                               \
+        sh2_unmap_domain_page(_sp);                                     \
+        _sl3mfn = _mfn(mfn_x(_sl3mfn) + 1);                             \
+    }                                                                   \
+} while (0)
+
+#elif GUEST_PAGING_LEVELS == 4
+
+/* 64-bit l3: touch all entries */
+#define SHADOW2_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code)        \
+do {                                                                    \
+    int _i;                                                             \
+    shadow_l3e_t *_sp = map_shadow_page((_sl3mfn));                     \
+    ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH2_type_mask)       \
+           == PGC_SH2_l3_64_shadow);                                    \
+    for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ )              \
+    {                                                                   \
+        (_sl3e) = _sp + _i;                                             \
+        if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT )           \
+            {_code}                                                     \
+        if ( _done ) break;                                             \
+        increment_ptr_to_guest_entry(_gl3p);                            \
+    }                                                                   \
+    unmap_shadow_page(_sp);                                             \
+} while (0)
+
+/* 64-bit l4: avoid Xen mappings */
+/* Skips slots failing is_guest_l4_slot() when _xen is set; note the
+ * guest cursor still advances for skipped slots. */
+#define SHADOW2_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code)  \
+do {                                                                    \
+    int _i;                                                             \
+    shadow_l4e_t *_sp = map_shadow_page((_sl4mfn));                     \
+    ASSERT((mfn_to_page(_sl4mfn)->count_info & PGC_SH2_type_mask)       \
+           == PGC_SH2_l4_64_shadow);                                    \
+    for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ )              \
+    {                                                                   \
+        if ( (!(_xen)) || is_guest_l4_slot(_i) )                        \
+        {                                                               \
+            (_sl4e) = _sp + _i;                                         \
+            if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT )       \
+                {_code}                                                 \
+            if ( _done ) break;                                         \
+        }                                                               \
+        increment_ptr_to_guest_entry(_gl4p);                            \
+    }                                                                   \
+    unmap_shadow_page(_sp);                                             \
+} while (0)
+
+#endif
+
+
+
+/**************************************************************************/
+/* Functions to install Xen mappings and linear mappings in shadow pages */
+
+static mfn_t sh2_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type);
+
+// XXX -- this function should probably be moved to shadow2-common.c, but that
+//        probably wants to wait until the shadow types have been moved from
+//        shadow2-types.h to shadow2-private.h
+//
+#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
+/* Fill in the Xen-owned slots of a fresh l4 shadow (sl4mfn, shadowing
+ * guest table gl4mfn): the common Xen mappings copied from the idle
+ * pagetable, the per-domain mapping, the guest and shadow linear
+ * mappings, and (for translated guests) the read-only P2M mapping. */
+void sh2_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn)
+{
+    struct domain *d = v->domain;
+    shadow_l4e_t *sl4e;
+
+    sl4e = sh2_map_domain_page(sl4mfn);
+    ASSERT(sl4e != NULL);
+    ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t));
+    
+    /* Copy the common Xen mappings from the idle domain */
+    memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
+           &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
+           ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t));
+
+    /* Install the per-domain mappings for this domain */
+    sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] =
+        shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)),
+                            __PAGE_HYPERVISOR);
+
+    /* Linear mapping: guest tables at LINEAR_PT, shadows at SH_LINEAR_PT */
+    sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] =
+        shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR);
+    sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
+        shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR);
+
+    if ( shadow2_mode_translate(v->domain) )
+    {
+        /* install domain-specific P2M table */
+        sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] =
+            shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
+                                __PAGE_HYPERVISOR);
+    }
+
+    sh2_unmap_domain_page(sl4e);    
+}
+#endif
+
+#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
+// For 3-on-3 PV guests, we need to make sure the xen mappings are in
+// place, which means that we need to populate the l2h entry in the l3
+// table.
+
+/* Fill in the Xen-owned slots of a PAE "l2h" shadow (the l2 that backs
+ * l3 slot 3): common Xen mappings from the idle l2, the per-domain
+ * mappings, and (for translated guests) l2es pointing at the frames of
+ * the domain's P2M table.  No linear map is set up here -- see the
+ * comment below. */
+void sh2_install_xen_entries_in_l2h(struct vcpu *v, 
+                                    mfn_t sl2hmfn)
+{
+    struct domain *d = v->domain;
+    shadow_l2e_t *sl2e;
+    int i;
+
+    sl2e = sh2_map_domain_page(sl2hmfn);
+    ASSERT(sl2e != NULL);
+    ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
+    
+    /* Copy the common Xen mappings from the idle domain */
+    memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
+           &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
+           L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
+
+    /* Install the per-domain mappings for this domain */
+    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+        sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+            shadow_l2e_from_mfn(
+                page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
+                __PAGE_HYPERVISOR);
+    
+    /* We don't set up a linear mapping here because we can't until this
+     * l2h is installed in an l3e.  sh2_update_linear_entries() handles
+     * the linear mappings when the l3 is loaded. */
+
+    if ( shadow2_mode_translate(d) )
+    {
+        /* Install the domain-specific p2m table: one l2e per l3e of
+         * the P2M, covering MACHPHYS_MBYTES of machine-to-phys space */
+        l3_pgentry_t *p2m;
+        ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
+        p2m = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+        for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ )
+        {
+            sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] =
+                shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])),
+                                    __PAGE_HYPERVISOR);
+        }
+        sh2_unmap_domain_page(p2m);
+    }
+    
+    sh2_unmap_domain_page(sl2e);
+}
+
+/* Make sure l3 slot 3 of the shadow (sl3mfn, shadowing gl3mfn) points
+ * at an l2h shadow of the guest's own slot-3 l2, creating the l2h
+ * shadow if it doesn't exist yet.  Only for non-external (PV) guests;
+ * the guest's own slot 3 must be present. */
+void sh2_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn)
+{
+    shadow_l3e_t *sl3e;
+    guest_l3e_t *gl3e = v->arch.guest_vtable;
+    shadow_l3e_t new_sl3e;
+    gfn_t l2gfn;
+    mfn_t l2gmfn, l2smfn;
+    int r;
+
+    ASSERT(!shadow2_mode_external(v->domain));
+    ASSERT(guest_l3e_get_flags(gl3e[3]) & _PAGE_PRESENT);
+    l2gfn = guest_l3e_get_gfn(gl3e[3]);
+    l2gmfn = sh2_gfn_to_mfn(v->domain, gfn_x(l2gfn));
+    l2smfn = get_shadow_status(v, l2gmfn, PGC_SH2_l2h_shadow);
+    if ( !valid_mfn(l2smfn) )
+    {
+        l2smfn = sh2_make_shadow(v, l2gmfn, PGC_SH2_l2h_shadow);
+    }
+    l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e,
+                             ft_prefetch);
+    sl3e = sh2_map_domain_page(sl3mfn);
+    /* NOTE(review): the result flags from shadow_set_l3e() are stored
+     * in r but never examined -- confirm that no flush/recopy flag can
+     * need handling on this path. */
+    r = shadow_set_l3e(v, &sl3e[3], new_sl3e, sl3mfn);
+    sh2_unmap_domain_page(sl3e);
+}
+#endif
+
+
+#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
+/* Fill in the Xen-owned slots of a fresh 2-level l2 shadow (sl2mfn,
+ * shadowing gl2mfn): common Xen mappings from the idle pagetable, the
+ * per-domain mappings, the guest and shadow linear mappings, and (for
+ * translated guests) the read-only P2M mapping. */
+void sh2_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn)
+{
+    struct domain *d = v->domain;
+    shadow_l2e_t *sl2e;
+    int i;
+
+    sl2e = sh2_map_domain_page(sl2mfn);
+    ASSERT(sl2e != NULL);
+    ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t));
+    
+    /* Copy the common Xen mappings from the idle domain */
+    memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
+           &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
+           L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
+
+    /* Install the per-domain mappings for this domain */
+    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
+        sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+            shadow_l2e_from_mfn(
+                page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i),
+                __PAGE_HYPERVISOR);
+
+    /* Linear mapping: guest tables at LINEAR_PT, shadows at SH_LINEAR_PT */
+    sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] =
+        shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR);
+    sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
+        shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR);
+
+    if ( shadow2_mode_translate(d) )
+    {
+        /* install domain-specific P2M table */
+        sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] =
+            shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table),
+                                __PAGE_HYPERVISOR);
+    }
+
+    sh2_unmap_domain_page(sl2e);
+}
+#endif
+
+
+
+
+
+/**************************************************************************/
+/* Create a shadow of a given guest page.
+ */
+/* Allocate and initialise a new shadow of type shadow_type for guest
+ * page gmfn: install Xen entries where the type requires it (root-level
+ * shadows of non-external guests), promote the guest page to pagetable
+ * status, and enter the new shadow in the hash.  Returns its mfn. */
+static mfn_t
+sh2_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+{
+    mfn_t smfn = shadow2_alloc(v->domain, shadow_type, mfn_x(gmfn));
+    SHADOW2_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n",
+                  mfn_x(gmfn), shadow_type, mfn_x(smfn));
+
+    if ( shadow_type != PGC_SH2_guest_root_type )
+        /* Lower-level shadow, not yet linked form a higher level */
+        mfn_to_page(smfn)->up = 0;
+
+    // Create the Xen mappings...
+    if ( !shadow2_mode_external(v->domain) )
+    {
+        switch (shadow_type) 
+        {
+#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4
+        case PGC_SH2_l4_shadow:
+            sh2_install_xen_entries_in_l4(v, gmfn, smfn); break;
+#endif
+#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3
+        case PGC_SH2_l3_shadow:
+            sh2_install_xen_entries_in_l3(v, gmfn, smfn); break;
+        case PGC_SH2_l2h_shadow:
+            sh2_install_xen_entries_in_l2h(v, smfn); break;
+#endif
+#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2
+        case PGC_SH2_l2_shadow:
+            sh2_install_xen_entries_in_l2(v, gmfn, smfn); break;
+#endif
+        default: /* Do nothing */ break;
+        }
+    }
+    
+    shadow2_promote(v, gmfn, shadow_type);
+    set_shadow2_status(v, gmfn, shadow_type, smfn);
+
+    return smfn;
+}
+
+/* Make a splintered superpage shadow */
+/* Allocate an fl1 shadow for the guest superpage at gfn and enter it in
+ * the fl1 hash (keyed by gfn rather than mfn, since there is no single
+ * guest l1 page backing it).  Returns the new shadow's mfn. */
+static mfn_t
+make_fl1_shadow(struct vcpu *v, gfn_t gfn)
+{
+    mfn_t smfn = shadow2_alloc(v->domain, PGC_SH2_fl1_shadow,
+                               (unsigned long) gfn_x(gfn));
+
+    SHADOW2_DEBUG(MAKE_SHADOW, "(%" SH2_PRI_gfn ")=>%" SH2_PRI_mfn "\n",
+                  gfn_x(gfn), mfn_x(smfn));
+
+    set_fl1_shadow_status(v, gfn, smfn);
+    return smfn;
+}
+
+
+#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
+/* Build the monitor pagetable for v -- the table Xen itself runs on
+ * while this vcpu executes.  Returns the mfn of its root.  The shape
+ * depends on CONFIG_PAGING_LEVELS; each allocated page records its
+ * level in shadow2_flags so teardown can find the sub-tables. */
+mfn_t
+sh2_make_monitor_table(struct vcpu *v)
+{
+
+    ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);
+    
+#if CONFIG_PAGING_LEVELS == 4    
+    {
+        struct domain *d = v->domain;
+        mfn_t m4mfn;
+        m4mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+        sh2_install_xen_entries_in_l4(v, m4mfn, m4mfn);
+        /* Remember the level of this table */
+        mfn_to_page(m4mfn)->shadow2_flags = 4;
+#if SHADOW_PAGING_LEVELS < 4
+        // Install a monitor l3 table in slot 0 of the l4 table.
+        // This is used for shadow linear maps.
+        {
+            mfn_t m3mfn; 
+            l4_pgentry_t *l4e;
+            m3mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+            mfn_to_page(m3mfn)->shadow2_flags = 3;
+            l4e = sh2_map_domain_page(m4mfn);
+            l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR);
+            sh2_unmap_domain_page(l4e);
+        }
+#endif /* SHADOW_PAGING_LEVELS < 4 */
+        return m4mfn;
+    }
+
+#elif CONFIG_PAGING_LEVELS == 3
+
+    {
+        struct domain *d = v->domain;
+        mfn_t m3mfn, m2mfn; 
+        l3_pgentry_t *l3e;
+        l2_pgentry_t *l2e;
+        int i;
+
+        m3mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+        /* Remember the level of this table */
+        mfn_to_page(m3mfn)->shadow2_flags = 3;
+
+        // Install a monitor l2 table in slot 3 of the l3 table.
+        // This is used for all Xen entries, including linear maps
+        m2mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+        mfn_to_page(m2mfn)->shadow2_flags = 2;
+        l3e = sh2_map_domain_page(m3mfn);
+        /* Bare _PAGE_PRESENT: presumably matching the restricted flag
+         * set PAE top-level entries allow -- TODO confirm */
+        l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT);
+        sh2_install_xen_entries_in_l2h(v, m2mfn);
+        /* Install the monitor's own linear map: one l2e per present
+         * l3e, so the whole monitor table is visible at LINEAR_PT */
+        l2e = sh2_map_domain_page(m2mfn);
+        for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+            l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
+                (l3e_get_flags(l3e[i]) & _PAGE_PRESENT) 
+                ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR) 
+                : l2e_empty();
+        sh2_unmap_domain_page(l2e);
+        sh2_unmap_domain_page(l3e);
+
+        SHADOW2_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn));
+        return m3mfn;
+    }
+
+#elif CONFIG_PAGING_LEVELS == 2
+
+    {
+        struct domain *d = v->domain;
+        mfn_t m2mfn;
+        m2mfn = shadow2_alloc(d, PGC_SH2_monitor_table, 0);
+        sh2_install_xen_entries_in_l2(v, m2mfn, m2mfn);
+        /* Remember the level of this table */
+        mfn_to_page(m2mfn)->shadow2_flags = 2;
+        return m2mfn;
+    }
+
+#else
+#error this should not happen
+#endif /* CONFIG_PAGING_LEVELS */
+}
+#endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */
+
+/**************************************************************************/
+/* These functions also take a virtual address and return the level-N
+ * shadow table mfn and entry, but they create the shadow pagetables if
+ * they are needed.  The "demand" argument is non-zero when handling
+ * a demand fault (so we know what to do about accessed bits &c).
+ * If the necessary tables are not present in the guest, they return NULL. */
+#if GUEST_PAGING_LEVELS >= 4
+/* Return a pointer to the shadow l4e for gw->va and record the l4
+ * shadow's mfn in *sl4mfn.  The top-level shadow always exists, so
+ * (unlike the lower levels) nothing needs creating and the result is
+ * never NULL. */
+static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v, 
+                                                walk_t *gw, 
+                                                mfn_t *sl4mfn)
+{
+    /* There is always a shadow of the top level table.  Get it. */
+    *sl4mfn = pagetable_get_mfn(v->arch.shadow_table);
+    /* Reading the top level table is always valid. */
+    return sh2_linear_l4_table(v) + shadow_l4_linear_offset(gw->va);
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#if GUEST_PAGING_LEVELS >= 3
+/* Return a pointer to the shadow l3e for gw->va, creating and linking
+ * an l3 shadow of the guest's l3 table if one doesn't exist, and record
+ * the l3 shadow's mfn in *sl3mfn.  Returns NULL only if the guest has
+ * no l3 page (gw->l3mfn invalid).  ft says whether this is a demand
+ * fault or a prefetch. */
+static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v, 
+                                                walk_t *gw, 
+                                                mfn_t *sl3mfn,
+                                                fetch_type_t ft)
+{
+#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */
+    mfn_t sl4mfn;
+    shadow_l4e_t *sl4e;
+    if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. */
+    /* Get the l4e */
+    sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn);
+    ASSERT(sl4e != NULL);
+    if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) 
+    {
+        *sl3mfn = shadow_l4e_get_mfn(*sl4e);
+        ASSERT(valid_mfn(*sl3mfn));
+    } 
+    else 
+    {
+        int r;
+        shadow_l4e_t new_sl4e;
+        /* No l3 shadow installed: find and install it. */
+        *sl3mfn = get_shadow_status(v, gw->l3mfn, PGC_SH2_l3_shadow);
+        if ( !valid_mfn(*sl3mfn) ) 
+        {
+            /* No l3 shadow of this page exists at all: make one. */
+            *sl3mfn = sh2_make_shadow(v, gw->l3mfn, PGC_SH2_l3_shadow);
+        }
+        /* Install the new sl3 table in the sl4e */
+        l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn, 
+                                 *sl3mfn, &new_sl4e, ft);
+        r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn);
+        /* Installing a fresh entry can never require a TLB flush */
+        ASSERT((r & SHADOW2_SET_FLUSH) == 0);
+    }
+    /* Now follow it down a level.  Guaranteed to succeed. */
+    return sh2_linear_l3_table(v) + shadow_l3_linear_offset(gw->va);
+#else /* PAE... */
+    /* There is always a shadow of the top level table.  Get it. */
+    *sl3mfn = pagetable_get_mfn(v->arch.shadow_table);
+    /* This next line is important: the shadow l3 table is in an 8k
+     * shadow and we need to return the right mfn of the pair. This call
+     * will set it for us as a side-effect. */
+    (void) shadow_l3_index(sl3mfn, guest_index(gw->l3e));
+    ASSERT(v->arch.shadow_vtable);
+    return ((shadow_l3e_t *)v->arch.shadow_vtable) 
+        + shadow_l3_table_offset(gw->va);
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+
+/* Return a pointer to the shadow l2e for gw->va, creating and linking
+ * an l2 shadow of the guest's l2 table if one doesn't exist, and record
+ * the l2 shadow's mfn in *sl2mfn.  Returns NULL only if the guest has
+ * no l2 page (gw->l2mfn invalid, PAE/64-bit only).  ft says whether
+ * this is a demand fault or a prefetch. */
+static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v, 
+                                                walk_t *gw, 
+                                                mfn_t *sl2mfn,
+                                                fetch_type_t ft)
+{
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64bit... */
+    mfn_t sl3mfn = _mfn(INVALID_MFN);
+    shadow_l3e_t *sl3e;
+    if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */
+    /* Get the l3e */
+    sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft);
+    ASSERT(sl3e != NULL);  /* Since we know guest PT is valid this far */
+    if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) 
+    {
+        *sl2mfn = shadow_l3e_get_mfn(*sl3e);
+        ASSERT(valid_mfn(*sl2mfn));
+    } 
+    else 
+    {
+        int r;
+        shadow_l3e_t new_sl3e;
+        /* No l2 shadow installed: find and install it. */
+        *sl2mfn = get_shadow_status(v, gw->l2mfn, PGC_SH2_l2_shadow);
+        if ( !valid_mfn(*sl2mfn) ) 
+        {
+            /* No l2 shadow of this page exists at all: make one. */
+            *sl2mfn = sh2_make_shadow(v, gw->l2mfn, PGC_SH2_l2_shadow);
+        }
+        /* Install the new sl2 table in the sl3e */
+        l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn, 
+                                 *sl2mfn, &new_sl3e, ft);
+        r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn);
+        ASSERT((r & SHADOW2_SET_FLUSH) == 0);
+#if GUEST_PAGING_LEVELS == 3 
+        /* Need to sync up the linear maps, as we are about to use them */
+        ASSERT( r & SHADOW2_SET_L3PAE_RECOPY );
+        sh2_pae_recopy(v->domain);
+#endif
+    }
+    /* Now follow it down a level.  Guaranteed to succeed. */
+    return sh2_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
+#else /* 32bit... */
+    /* There is always a shadow of the top level table.  Get it. */
+    *sl2mfn = pagetable_get_mfn(v->arch.shadow_table);
+    /* This next line is important: the guest l2 has a 16k
+     * shadow, we need to return the right mfn of the four. This
+     * call will set it for us as a side-effect. */
+    (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e));
+    /* Reading the top level table is always valid. */
+    return sh2_linear_l2_table(v) + shadow_l2_linear_offset(gw->va);
+#endif 
+}
+
+
+static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v, 
+                                                walk_t *gw, 
+                                                mfn_t *sl1mfn,
+                                                fetch_type_t ft)
+/* Find (or make and install) the l1 shadow covering gw->va and return a
+ * pointer to the shadow l1e for gw->va through the linear mapping.
+ * Sets *sl1mfn to the mfn of the l1 shadow.  Returns NULL if there is
+ * no guest page to shadow at this address. */
+{
+    mfn_t sl2mfn;
+    shadow_l2e_t *sl2e;
+
+    /* Get the l2e */
+    sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft);
+    if ( sl2e == NULL ) return NULL;
+    if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) 
+    {
+        *sl1mfn = shadow_l2e_get_mfn(*sl2e);
+        ASSERT(valid_mfn(*sl1mfn));
+    } 
+    else 
+    {
+        shadow_l2e_t new_sl2e;
+        int r, flags = guest_l2e_get_flags(*gw->l2e);
+        /* No l1 shadow installed: find and install it. */
+        if ( !(flags & _PAGE_PRESENT) )
+            return NULL; /* No guest page. */
+        if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) ) 
+        {
+            /* Splintering a superpage: the "fl1" is a fake l1 shadow,
+             * keyed on the superpage's gfn rather than a guest l1 mfn. */
+            gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e);
+            *sl1mfn = get_fl1_shadow_status(v, l2gfn);
+            if ( !valid_mfn(*sl1mfn) ) 
+            {
+                /* No fl1 shadow of this superpage exists at all: make one. */
+                *sl1mfn = make_fl1_shadow(v, l2gfn);
+            }
+        } 
+        else 
+        {
+            /* Shadowing an actual guest l1 table.
+             * Check the l1 table's mfn, not the l2's: the l2 mfn must
+             * already have been valid for us to read *gw->l2e above,
+             * but the l1 mfn we are about to shadow may not be. */
+            if ( !valid_mfn(gw->l1mfn) ) return NULL; /* No guest page. */
+            *sl1mfn = get_shadow_status(v, gw->l1mfn, PGC_SH2_l1_shadow);
+            if ( !valid_mfn(*sl1mfn) ) 
+            {
+                /* No l1 shadow of this page exists at all: make one. */
+                *sl1mfn = sh2_make_shadow(v, gw->l1mfn, PGC_SH2_l1_shadow);
+            }
+        }
+        /* Install the new sl1 table in the sl2e */
+        l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn, 
+                                 *sl1mfn, &new_sl2e, ft);
+        r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn);
+        /* Installing a fresh shadow never requires a TLB flush. */
+        ASSERT((r & SHADOW2_SET_FLUSH) == 0);        
+        /* This next line is important: in 32-on-PAE and 32-on-64 modes,
+         * the guest l1 table has an 8k shadow, and we need to return
+         * the right mfn of the pair. This call will set it for us as a
+         * side-effect.  (In all other cases, it's a no-op and will be
+         * compiled out.) */
+        (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va));
+    }
+    /* Now follow it down a level.  Guaranteed to succeed. */
+    return sh2_linear_l1_table(v) + shadow_l1_linear_offset(gw->va);
+}
+
+
+
+/**************************************************************************/
+/* Destructors for shadow tables: 
+ * Unregister the shadow, decrement refcounts of any entries present in it,
+ * and release the memory.
+ *
+ * N.B. These destructors do not clear the contents of the shadows.
+ *      This allows us to delay TLB shootdowns until the page is being reused.
+ *      See shadow2_alloc() and shadow2_free() for how this is handled.
+ */
+
+#if GUEST_PAGING_LEVELS >= 4
+void sh2_destroy_l4_shadow(struct vcpu *v, mfn_t smfn)
+/* Unregister an l4 shadow, drop the refs it held on lower shadows, and
+ * return its memory to the shadow pool.  The shadow's contents are not
+ * cleared (see the destructor comment block above). */
+{
+    shadow_l4e_t *sl4e;
+    u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
+    mfn_t gmfn, sl4mfn;
+    int xen_mappings;
+
+    SHADOW2_DEBUG(DESTROY_SHADOW,
+                  "%s(%05lx)\n", __func__, mfn_x(smfn));
+    ASSERT(t == PGC_SH2_l4_shadow);
+
+    /* Record that the guest page isn't shadowed any more (in this type) */
+    gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+    delete_shadow2_status(v, gmfn, t, smfn);
+    shadow2_demote(v, gmfn, t);
+    /* Take this shadow off the list of root shadows */
+    list_del_init(&mfn_to_page(smfn)->list);
+
+    /* Decrement refcounts of all the old entries.  Skip the Xen-private
+     * entries (present only in non-external-mode shadows). */
+    xen_mappings = (!shadow2_mode_external(v->domain));
+    sl4mfn = smfn; 
+    SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
+        if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) 
+        {
+            /* The ref was taken with a backpointer to this entry's
+             * physical address, so drop it with the same address. */
+            sh2_put_ref(v, shadow_l4e_get_mfn(*sl4e),
+                        (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) 
+                        | ((unsigned long)sl4e & ~PAGE_MASK));
+        }
+    });
+    
+    /* Put the memory back in the pool */
+    shadow2_free(v->domain, smfn);
+}
+#endif    
+
+#if GUEST_PAGING_LEVELS >= 3
+void sh2_destroy_l3_shadow(struct vcpu *v, mfn_t smfn)
+/* Unregister an l3 shadow, drop the refs it held on l2 shadows, and
+ * return its memory to the shadow pool.  Contents are not cleared. */
+{
+    shadow_l3e_t *sl3e;
+    u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
+    mfn_t gmfn, sl3mfn;
+
+    SHADOW2_DEBUG(DESTROY_SHADOW,
+                  "%s(%05lx)\n", __func__, mfn_x(smfn));
+    ASSERT(t == PGC_SH2_l3_shadow);
+
+    /* Record that the guest page isn't shadowed any more (in this type) */
+    gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+    delete_shadow2_status(v, gmfn, t, smfn);
+    shadow2_demote(v, gmfn, t);
+#if GUEST_PAGING_LEVELS == 3
+    /* PAE l3s are the top level, so they are on the root-shadow list.
+     * Take this shadow off the list of root shadows */
+    list_del_init(&mfn_to_page(smfn)->list);
+#endif
+
+    /* Decrement refcounts of all the old entries */
+    sl3mfn = smfn; 
+    SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
+        if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) 
+            sh2_put_ref(v, shadow_l3e_get_mfn(*sl3e),
+                        (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) 
+                        | ((unsigned long)sl3e & ~PAGE_MASK));
+    });
+
+    /* Put the memory back in the pool */
+    shadow2_free(v->domain, smfn);
+}
+#endif    
+
+
+#if GUEST_PAGING_LEVELS == 3
+static void sh2_destroy_l3_subshadow(struct vcpu *v, 
+                                     shadow_l3e_t *sl3e)
+/* Tear down just a single 4-entry l3 on a 2-page l3 shadow. */
+{
+    int i;
+    /* Subshadows are 4-entry-aligned within the shadow page. */
+    ASSERT((unsigned long)sl3e % (4 * sizeof (shadow_l3e_t)) == 0); 
+    for ( i = 0; i < GUEST_L3_PAGETABLE_ENTRIES; i++ ) 
+        if ( shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT ) 
+            /* sl3e must be a mapped domain page for this address lookup */
+            sh2_put_ref(v, shadow_l3e_get_mfn(sl3e[i]),
+                        mapped_domain_page_to_maddr(sl3e));
+}
+#endif
+
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+void sh2_unpin_all_l3_subshadows(struct vcpu *v, mfn_t smfn)
+/* Walk a full PAE l3 shadow, unpinning all of the subshadows on it */
+{
+    int i, j;
+    struct pae_l3_bookkeeping *bk;
+    
+    ASSERT((mfn_to_page(smfn)->count_info & PGC_SH2_type_mask) 
+           == PGC_SH2_l3_pae_shadow);
+    /* The subshadows are split, 64 on each page of the shadow.
+     * Layout per 64-byte slot: 32 bytes of l3es then 32 bytes of
+     * bookkeeping. */
+    for ( i = 0; i < 2; i++ ) 
+    {
+        void *p = sh2_map_domain_page(_mfn(mfn_x(smfn) + i));
+        for ( j = 0; j < 64; j++ )
+        {
+            /* Every second 32-byte region is a bookkeeping entry */
+            bk = (struct pae_l3_bookkeeping *)(p + (64 * j) + 32);
+            if ( bk->pinned )
+                sh2_unpin_l3_subshadow(v, (shadow_l3e_t *)(p + (64*j)), smfn);
+            /* Check whether we've just freed the whole shadow: if so the
+             * page we have mapped may be reused, so stop walking it. */
+            if ( (mfn_to_page(smfn)->count_info & PGC_SH2_count_mask) == 0 ) 
+            {
+                sh2_unmap_domain_page(p);
+                return;
+            }
+        }
+        sh2_unmap_domain_page(p);
+    }
+}
+#endif
+
+void sh2_destroy_l2_shadow(struct vcpu *v, mfn_t smfn)
+/* Unregister an l2 (or PAE high-l2) shadow, drop the refs it held on
+ * l1 shadows, and return its memory to the pool.  Not cleared. */
+{
+    shadow_l2e_t *sl2e;
+    u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
+    mfn_t gmfn, sl2mfn;
+    int xen_mappings;
+
+    SHADOW2_DEBUG(DESTROY_SHADOW,
+                  "%s(%05lx)\n", __func__, mfn_x(smfn));
+    ASSERT(t == PGC_SH2_l2_shadow 
+           || t == PGC_SH2_l2h_pae_shadow);
+
+    /* Record that the guest page isn't shadowed any more (in this type) */
+    gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+    delete_shadow2_status(v, gmfn, t, smfn);
+    shadow2_demote(v, gmfn, t);
+#if GUEST_PAGING_LEVELS == 2
+    /* 32-bit l2s are the top level, so they are on the root-shadow list.
+     * Take this shadow off the list of root shadows */
+    list_del_init(&mfn_to_page(smfn)->list);
+#endif
+
+    /* Decrement refcounts of all the old entries.  Xen-private entries
+     * exist only in non-external mode, and among PAE l2s only in the
+     * high l2 (the one covering the top of the address space). */
+    sl2mfn = smfn;
+    xen_mappings = (!shadow2_mode_external(v->domain) &&
+                    ((GUEST_PAGING_LEVELS == 2) ||
+                     ((GUEST_PAGING_LEVELS == 3) &&
+                      (t == PGC_SH2_l2h_pae_shadow))));
+    SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
+        if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) 
+            sh2_put_ref(v, shadow_l2e_get_mfn(*sl2e),
+                        (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT) 
+                        | ((unsigned long)sl2e & ~PAGE_MASK));
+    });
+
+    /* Put the memory back in the pool */
+    shadow2_free(v->domain, smfn);
+}
+
+void sh2_destroy_l1_shadow(struct vcpu *v, mfn_t smfn)
+/* Unregister an l1 (or fl1 superpage-splinter) shadow, drop the page
+ * refs it held, and return its memory to the pool.  Not cleared. */
+{
+    struct domain *d = v->domain;
+    shadow_l1e_t *sl1e;
+    u32 t = mfn_to_page(smfn)->count_info & PGC_SH2_type_mask;
+
+    SHADOW2_DEBUG(DESTROY_SHADOW,
+                  "%s(%05lx)\n", __func__, mfn_x(smfn));
+    ASSERT(t == PGC_SH2_l1_shadow || t == PGC_SH2_fl1_shadow);
+
+    /* Record that the guest page isn't shadowed any more (in this type).
+     * fl1 shadows are keyed on the superpage's gfn; real l1 shadows are
+     * keyed on the guest l1 page's mfn. */
+    if ( t == PGC_SH2_fl1_shadow )
+    {
+        gfn_t gfn = _gfn(mfn_to_page(smfn)->u.inuse.type_info);
+        delete_fl1_shadow_status(v, gfn, smfn);
+    }
+    else 
+    {
+        mfn_t gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info);
+        delete_shadow2_status(v, gmfn, t, smfn);
+        shadow2_demote(v, gmfn, t);
+    }
+    
+    if ( shadow2_mode_refcounts(d) )
+    {
+        /* Decrement refcounts of all the old entries */
+        mfn_t sl1mfn = smfn; 
+        SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, 0, {
+            if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT ) 
+                shadow2_put_page_from_l1e(*sl1e, d);
+        });
+    }
+    
+    /* Put the memory back in the pool */
+    shadow2_free(v->domain, smfn);
+}
+
+#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS
+void sh2_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
+/* Free a monitor pagetable, including any lower-level monitor pages
+ * that were allocated alongside it. */
+{
+    struct domain *d = v->domain;
+    ASSERT((mfn_to_page(mmfn)->count_info & PGC_SH2_type_mask)
+           == PGC_SH2_monitor_table);
+
+#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4)
+    /* Need to destroy the l3 monitor page in slot 0 too */
+    {
+        l4_pgentry_t *l4e = sh2_map_domain_page(mmfn);
+        ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
+        shadow2_free(d, _mfn(l4e_get_pfn(l4e[0])));
+        sh2_unmap_domain_page(l4e);
+    }
+#elif CONFIG_PAGING_LEVELS == 3
+    /* Need to destroy the l2 monitor page in slot 3 too */
+    {
+        l3_pgentry_t *l3e = sh2_map_domain_page(mmfn);
+        ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
+        shadow2_free(d, _mfn(l3e_get_pfn(l3e[3])));
+        sh2_unmap_domain_page(l3e);
+    }
+#endif
+
+    /* Put the memory back in the pool */
+    shadow2_free(d, mmfn);
+}
+#endif
+
+/**************************************************************************/
+/* Functions to destroy non-Xen mappings in a pagetable hierarchy.
+ * These are called from common code when we are running out of shadow
+ * memory, and unpinning all the top-level shadows hasn't worked. 
+ *
+ * This implementation is pretty crude and slow, but we hope that it won't 
+ * be called very often. */
+
+#if GUEST_PAGING_LEVELS == 2
+
+void sh2_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn)
+/* Blank every guest-visible entry of a 32-bit top-level (l2) shadow,
+ * dropping the refs they held; Xen-private entries are skipped. */
+{    
+    shadow_l2e_t *sl2e;
+    int xen_mappings = !shadow2_mode_external(v->domain);
+    SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, {
+        (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+    });
+}
+
+#elif GUEST_PAGING_LEVELS == 3
+
+void sh2_unhook_pae_mappings(struct vcpu *v, mfn_t sl3mfn)
+/* Walk a full PAE l3 shadow, unhooking entries from all the subshadows */
+{
+    shadow_l3e_t *sl3e;
+    SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, 0, {
+        if ( (shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) {
+            mfn_t sl2mfn = shadow_l3e_get_mfn(*sl3e);
+            if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH2_type_mask) 
+                 == PGC_SH2_l2h_pae_shadow ) 
+            {
+                /* High l2: contains the Xen mappings, so we must not drop
+                 * the whole l3e; pick particular l2es to unhook instead. */
+                shadow_l2e_t *sl2e;
+                SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, 0, 1, {
+                    (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+                });
+            }
+            else
+            {
+                /* Normal l2: can safely unhook the whole l3e */
+                (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
+            }
+        }
+    });
+    /* We've changed PAE L3 entries: must sync up various copies of them */
+    sh2_pae_recopy(v->domain);
+}
+
+#elif GUEST_PAGING_LEVELS == 4
+
+void sh2_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn)
+/* Blank every guest-visible entry of a 64-bit top-level (l4) shadow,
+ * dropping the refs they held; Xen-private entries are skipped. */
+{
+    shadow_l4e_t *sl4e;
+    int xen_mappings = !shadow2_mode_external(v->domain);
+    SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, {
+        (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
+    });
+}
+
+#endif
+
+/**************************************************************************/
+/* Internal translation functions.
+ * These functions require a pointer to the shadow entry that will be updated.
+ */
+
+/* These functions take a new guest entry, translate it to shadow and write 
+ * the shadow entry.
+ *
+ * They return the same bitmaps as the shadow_set_lXe() functions.
+ */
+
+#if GUEST_PAGING_LEVELS >= 4
+static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se)
+/* Translate a new guest l4e and write it into the shadow l4e at 'se'.
+ * Returns the shadow_set_l4e() bitmap, possibly with SHADOW2_SET_ERROR
+ * added if the guest entry points at an invalid gfn. */
+{
+    shadow_l4e_t new_sl4e;
+    guest_l4e_t *new_gl4e = new_ge;
+    shadow_l4e_t *sl4p = se;
+    mfn_t sl3mfn = _mfn(INVALID_MFN);
+    int result = 0;
+
+    perfc_incrc(shadow2_validate_gl4e_calls);
+
+    if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT )
+    {
+        gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e);
+        mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn);
+        if ( valid_mfn(gl3mfn) )
+            sl3mfn = get_shadow_status(v, gl3mfn, PGC_SH2_l3_shadow);
+        else
+            result |= SHADOW2_SET_ERROR;
+    }
+    /* INVALID_MFN sl3mfn propagates as a not-present shadow entry. */
+    l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN),
+                             sl3mfn, &new_sl4e, ft_prefetch);
+    result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn);
+    return result;
+}
+#endif // GUEST_PAGING_LEVELS >= 4
+
+#if GUEST_PAGING_LEVELS >= 3
+static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se)
+/* Translate a new guest l3e and write it into the shadow l3e at 'se'.
+ * Returns the shadow_set_l3e() bitmap, possibly with SHADOW2_SET_ERROR
+ * added if the guest entry points at an invalid gfn. */
+{
+    shadow_l3e_t new_sl3e;
+    guest_l3e_t *new_gl3e = new_ge;
+    shadow_l3e_t *sl3p = se;
+    mfn_t sl2mfn = _mfn(INVALID_MFN);
+    int result = 0;
+
+    perfc_incrc(shadow2_validate_gl3e_calls);
+
+    if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT )
+    {
+        gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e);
+        mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn);
+        if ( valid_mfn(gl2mfn) )
+            sl2mfn = get_shadow_status(v, gl2mfn, PGC_SH2_l2_shadow);
+        else
+            result |= SHADOW2_SET_ERROR;
+    }
+    l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN), 
+                             sl2mfn, &new_sl3e, ft_prefetch);
+    result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn);
+
+#if GUEST_PAGING_LEVELS == 3
+    /* We have changed a PAE l3 entry: need to sync up the possible copies 
+     * of it */
+    if ( result & SHADOW2_SET_L3PAE_RECOPY )
+        sh2_pae_recopy(v->domain);
+#endif
+
+    return result;
+}
+#endif // GUEST_PAGING_LEVELS >= 3
+
+static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se)
+/* Translate a new guest l2e and write it into the shadow l2e at 'se'.
+ * Handles both ordinary l2es (pointing at a guest l1) and superpage
+ * l2es (which map to a splintered fl1 shadow).  Returns the
+ * shadow_set_l2e() bitmap, possibly with SHADOW2_SET_ERROR added. */
+{
+    shadow_l2e_t new_sl2e;
+    guest_l2e_t *new_gl2e = new_ge;
+    shadow_l2e_t *sl2p = se;
+    mfn_t sl1mfn = _mfn(INVALID_MFN);
+    int result = 0;
+
+    perfc_incrc(shadow2_validate_gl2e_calls);
+
+    if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT )
+    {
+        gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e);
+        if ( guest_supports_superpages(v) &&
+             (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) )
+        {
+            // superpage -- need to look up the shadow L1 which holds the
+            // splitters...
+            sl1mfn = get_fl1_shadow_status(v, gl1gfn);
+#if 0
+            // XXX - it's possible that we want to do some kind of prefetch
+            // for superpage fl1's here, but this is *not* on the demand path,
+            // so we'll hold off trying that for now...
+            //
+            if ( !valid_mfn(sl1mfn) )
+                sl1mfn = make_fl1_shadow(v, gl1gfn);
+#endif
+        }
+        else
+        {
+            mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn);
+            if ( valid_mfn(gl1mfn) )
+                sl1mfn = get_shadow_status(v, gl1mfn, PGC_SH2_l1_shadow);
+            else
+                result |= SHADOW2_SET_ERROR;
+        }
+    }
+    l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN),
+                             sl1mfn, &new_sl2e, ft_prefetch);
+    result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn);
+
+    return result;
+}
+
+static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se)
+/* Translate a new guest l1e and write it into the shadow l1e at 'se'.
+ * A gfn with no valid mfn is treated as an MMIO mapping.  Returns the
+ * shadow_set_l1e() bitmap. */
+{
+    shadow_l1e_t new_sl1e;
+    guest_l1e_t *new_gl1e = new_ge;
+    shadow_l1e_t *sl1p = se;
+    gfn_t gfn;
+    mfn_t mfn;
+    int result = 0;
+
+    perfc_incrc(shadow2_validate_gl1e_calls);
+
+    gfn = guest_l1e_get_gfn(*new_gl1e);
+    mfn = vcpu_gfn_to_mfn(v, gfn);
+
+    l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e, 
+                             /* mmio? */ !valid_mfn(mfn));
+    
+    result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
+    return result;
+}
+
+
+/**************************************************************************/
+/* Functions which translate and install a the shadows of arbitrary guest 
+ * entries that we have just seen the guest write. */
+
+
+static inline int 
+sh2_map_and_validate(struct vcpu *v, mfn_t gmfn,
+                     void *new_gp, u32 size, u32 sh_type, 
+                     u32 (*shadow_index)(mfn_t *smfn, u32 idx),
+                     int (*validate_ge)(struct vcpu *v, void *ge, 
+                                        mfn_t smfn, void *se))
+/* Generic function for mapping and validating: revalidates the shadow
+ * entries corresponding to each guest entry touched by a write of
+ * 'size' bytes at 'new_gp' inside guest page gmfn.  'shadow_index'
+ * handles multi-page shadows (it may switch *smfn to a sibling page);
+ * 'validate_ge' is the per-entry translator.  Returns the OR of the
+ * validate results.  A shadow of type sh_type must already exist. */
+{
+    mfn_t smfn, smfn2, map_mfn;
+    shadow_l1e_t *sl1p;
+    u32 shadow_idx, guest_idx;
+    int result = 0;
+
+    /* Align address and size to guest entry boundaries */
+    size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1);
+    new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1));
+    size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1);
+    /* The whole write must lie within one guest page. */
+    ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE);
+
+    /* Map the shadow page */
+    smfn = get_shadow_status(v, gmfn, sh_type);
+    ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */
+    guest_idx = guest_index(new_gp);
+    map_mfn = smfn;
+    shadow_idx = shadow_index(&map_mfn, guest_idx);
+    sl1p = map_shadow_page(map_mfn);
+
+    /* Validate one entry at a time */
+    while ( size )
+    {
+        smfn2 = smfn;
+        guest_idx = guest_index(new_gp);
+        shadow_idx = shadow_index(&smfn2, guest_idx);
+        if ( mfn_x(smfn2) != mfn_x(map_mfn) )
+        {
+            /* We have moved to another page of the shadow */
+            map_mfn = smfn2;
+            unmap_shadow_page(sl1p);
+            sl1p = map_shadow_page(map_mfn);
+        }
+        /* Entries of every level share the l1e slot size, so indexing
+         * through an sl1e pointer is safe for all shadow types. */
+        result |= validate_ge(v,
+                              new_gp,
+                              map_mfn,
+                              &sl1p[shadow_idx]);
+        size -= sizeof(guest_l1e_t);
+        new_gp += sizeof(guest_l1e_t);
+    }
+    unmap_shadow_page(sl1p);
+    return result;
+}
+
+
+int
+sh2_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn,
+                          void *new_gl4p, u32 size)
+/* Revalidate the shadow entries for a guest write to an l4 table.
+ * Only reachable in 64-bit guest modes. */
+{
+#if GUEST_PAGING_LEVELS >= 4
+    return sh2_map_and_validate(v, gl4mfn, new_gl4p, size, 
+                                PGC_SH2_l4_shadow, 
+                                shadow_l4_index, 
+                                validate_gl4e);
+#else // ! GUEST_PAGING_LEVELS >= 4
+    SHADOW2_PRINTK("called in wrong paging mode!\n");
+    BUG();
+    return 0;
+#endif 
+}
+    
+int
+sh2_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn,
+                          void *new_gl3p, u32 size)
+/* Revalidate the shadow entries for a guest write to an l3 table.
+ * Only reachable in PAE and 64-bit guest modes. */
+{
+#if GUEST_PAGING_LEVELS >= 3
+    return sh2_map_and_validate(v, gl3mfn, new_gl3p, size, 
+                                PGC_SH2_l3_shadow, 
+                                shadow_l3_index, 
+                                validate_gl3e);
+#else // ! GUEST_PAGING_LEVELS >= 3
+    SHADOW2_PRINTK("called in wrong paging mode!\n");
+    BUG();
+    return 0;
+#endif
+}
+
+int
+sh2_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn,
+                          void *new_gl2p, u32 size)
+/* Revalidate the shadow entries for a guest write of 'size' bytes at
+ * 'new_gl2p' within the guest l2 page gl2mfn. */
+{
+    /* Every guest mode has l2 tables, so no paging-mode check needed. */
+    return sh2_map_and_validate(v, gl2mfn, new_gl2p, size,
+                                PGC_SH2_l2_shadow, shadow_l2_index,
+                                validate_gl2e);
+}
+
+int
+sh2_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn,
+                           void *new_gl2p, u32 size)
+/* Revalidate the shadow entries for a guest write to a PAE high l2
+ * table (the one covering the top of the address space). */
+{
+#if GUEST_PAGING_LEVELS == 3
+    /* NOTE(review): destroy/unhook paths use PGC_SH2_l2h_pae_shadow for
+     * this shadow type; confirm PGC_SH2_l2h_shadow is the same value. */
+    return sh2_map_and_validate(v, gl2mfn, new_gl2p, size, 
+                                PGC_SH2_l2h_shadow, 
+                                shadow_l2_index, 
+                                validate_gl2e);
+#else /* Non-PAE guests don't have different kinds of l2 table */
+    SHADOW2_PRINTK("called in wrong paging mode!\n");
+    BUG();
+    return 0;
+#endif
+}
+
+int
+sh2_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn,
+                          void *new_gl1p, u32 size)
+/* Revalidate the shadow entries for a guest write of 'size' bytes at
+ * 'new_gl1p' within the guest l1 page gl1mfn. */
+{
+    /* Every guest mode has l1 tables, so no paging-mode check needed. */
+    return sh2_map_and_validate(v, gl1mfn, new_gl1p, size,
+                                PGC_SH2_l1_shadow, shadow_l1_index,
+                                validate_gl1e);
+}
+
+
+/**************************************************************************/
+/* Optimization: If we see two emulated writes of zeros to the same
+ * page-table without another kind of page fault in between, we guess
+ * that this is a batch of changes (for process destruction) and
+ * unshadow the page so we don't take a pagefault on every entry.  This
+ * should also make finding writeable mappings of pagetables much
+ * easier. */
+
+/* Look to see if this is the second emulated write in a row to this
+ * page, and unshadow/unhook if it is */
+static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn)
+{
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW
+    /* Two consecutive emulated writes to the same shadowed page are
+     * taken as a hint that the guest is tearing this pagetable down. */
+    if ( v->arch.last_emulated_mfn == mfn_x(gmfn) &&
+         sh2_mfn_is_a_page_table(gmfn) )
+    {
+        u32 flags = mfn_to_page(gmfn)->shadow2_flags;
+        mfn_t smfn;
+        if ( !(flags & (SH2F_L2_32|SH2F_L3_PAE|SH2F_L4_64)) )
+        {
+            /* Not a top-level shadow: just drop the shadows of it. */
+            perfc_incrc(shadow2_early_unshadow);
+            sh2_remove_shadows(v, gmfn, 0 /* Can fail to unshadow */ );
+            return;
+        }
+        /* SH2F_unhooked_mappings is set to make sure we only unhook
+         * once in a single batch of updates. It is reset when this
+         * top-level page is loaded into CR3 again */
+        if ( !(flags & SH2F_unhooked_mappings) ) 
+        {
+            perfc_incrc(shadow2_early_unshadow_top);
+            mfn_to_page(gmfn)->shadow2_flags |= SH2F_unhooked_mappings;
+            /* Unhook each kind of top-level shadow this page has. */
+            if ( flags & SH2F_L2_32 )
+            {
+                smfn = get_shadow_status(v, gmfn, PGC_SH2_l2_32_shadow);
+                shadow2_unhook_mappings(v, smfn);
+            }
+            if ( flags & SH2F_L3_PAE ) 
+            {
+                smfn = get_shadow_status(v, gmfn, PGC_SH2_l3_pae_shadow);
+                shadow2_unhook_mappings(v, smfn);
+            }
+            if ( flags & SH2F_L4_64 ) 
+            {
+                smfn = get_shadow_status(v, gmfn, PGC_SH2_l4_64_shadow);
+                shadow2_unhook_mappings(v, smfn);
+            }
+        }
+    }
+    /* Remember this mfn so a second write to it triggers the heuristic. */
+    v->arch.last_emulated_mfn = mfn_x(gmfn);
+#endif
+}
+
+/* Stop counting towards early unshadows, as we've seen a real page fault.
+ * (INVALID_MFN can never match a real gmfn in check_for_early_unshadow.) */
+static inline void reset_early_unshadow(struct vcpu *v)
+{
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW
+    v->arch.last_emulated_mfn = INVALID_MFN;
+#endif
+}
+
+
+
+/**************************************************************************/
+/* Entry points into the shadow code */
+
+/* Called from pagefault handler in Xen, and from the HVM trap handlers
+ * for pagefaults.  Returns 1 if this fault was an artefact of the
+ * shadow code (and the guest should retry) or 0 if it is not (and the
+ * fault should be handled elsewhere or passed to the guest). */
+
+static int sh2_page_fault(struct vcpu *v, 
+                          unsigned long va, 
+                          struct cpu_user_regs *regs)
+{
+    struct domain *d = v->domain;
+    walk_t gw;
+    u32 accumulated_gflags;
+    gfn_t gfn;
+    mfn_t gmfn, sl1mfn=_mfn(0);
+    shadow_l1e_t sl1e, *ptr_sl1e;
+    paddr_t gpa;
+    struct cpu_user_regs emul_regs;
+    struct x86_emulate_ctxt emul_ctxt;
+    int r, mmio;
+    fetch_type_t ft = 0;
+
+    //
+    // XXX: Need to think about eventually mapping superpages directly in the
+    //      shadow (when possible), as opposed to splintering them into a
+    //      bunch of 4K maps.
+    //
+
+    SHADOW2_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
+                   v->domain->domain_id, v->vcpu_id, va, regs->error_code);
+    
+    shadow2_lock(d);
+
+    shadow2_audit_tables(v);
+                   
+    if ( guest_walk_tables(v, va, &gw, 1) != 0 )
+    {
+        SHADOW2_PRINTK("malformed guest pagetable!");
+        print_gw(&gw);
+    }
+
+    sh2_audit_gw(v, &gw);
+
+    // We do not look at the gw->l1e, as that will not exist for superpages.
+    // Instead, we use the gw->eff_l1e...
+    //
+    // We need not check all the levels of the guest page table entries for
+    // present vs not-present, as the eff_l1e will always be not present if
+    // one of the higher level entries is not present.
+    //
+    if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
+    {
+        if ( hvm_guest(v) && !shadow2_vcpu_mode_translate(v) )
+        {
+            /* Not present in p2m map, means this is mmio */
+            gpa = va;
+            goto mmio;
+        }
+
+        perfc_incrc(shadow2_fault_bail_not_present);
+        goto not_a_shadow_fault;
+    }
+
+    // All levels of the guest page table are now known to be present.
+    accumulated_gflags = accumulate_guest_flags(&gw);
+
+    // Check for attempts to access supervisor-only pages from user mode,
+    // i.e. ring 3.  Such errors are not caused or dealt with by the shadow
+    // code.
+    //
+    if ( (regs->error_code & X86_PFEC_SUPERVISOR_FAULT) &&
+         !(accumulated_gflags & _PAGE_USER) )
+    {
+        /* illegal user-mode access to supervisor-only page */
+        perfc_incrc(shadow2_fault_bail_user_supervisor);
+        goto not_a_shadow_fault;
+    }
+
+    // Was it a write fault?
+    //
+    if ( regs->error_code & X86_PFEC_WRITE_FAULT )
+    {
+        if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
+        {
+            perfc_incrc(shadow2_fault_bail_ro_mapping);
+            goto not_a_shadow_fault;
+        }
+    }
+    else // must have been either an insn fetch or read fault
+    {
+        // Check for NX bit violations: attempts to execute code that is
+        // marked "do not execute".  Such errors are not caused or dealt with
+        // by the shadow code.
+        //
+        if ( regs->error_code & X86_PFEC_INSN_FETCH_FAULT )
+        {
+            if ( accumulated_gflags & _PAGE_NX_BIT )
+            {
+                /* NX prevented this code fetch */
+                perfc_incrc(shadow2_fault_bail_nx);
+                goto not_a_shadow_fault;
+            }
+        }
+    }
+
+    /* Is this an MMIO access? */
+    gfn = guest_l1e_get_gfn(gw.eff_l1e);
+    mmio = ( hvm_guest(v) 
+             && shadow2_vcpu_mode_translate(v) 
+             && mmio_space(gfn_to_paddr(gfn)) );
+
+    /* For MMIO, the shadow holds the *gfn*; for normal accesses, if holds 
+     * the equivalent mfn. */
+    if ( mmio ) 
+        gmfn = _mfn(gfn_x(gfn));
+    else
+    {
+        gmfn = vcpu_gfn_to_mfn(v, gfn);
+        if ( !valid_mfn(gmfn) )
+        {
+            perfc_incrc(shadow2_fault_bail_bad_gfn);
+            SHADOW2_PRINTK("BAD gfn=%"SH2_PRI_gfn" gmfn=%"SH2_PRI_mfn"\n", 
+                           gfn_x(gfn), mfn_x(gmfn));
+            goto not_a_shadow_fault;
+        }
+    }
+
+    /* Make sure there is enough free shadow memory to build a chain of
+     * shadow tables: one SHADOW2_MAX_ORDER chunk will always be enough
+     * to allocate all we need.  (We never allocate a top-level shadow
+     * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
+    shadow2_prealloc(d, SHADOW2_MAX_ORDER);
+
+    /* Acquire the shadow.  This must happen before we figure out the rights 
+     * for the shadow entry, since we might promote a page here. */
+    // XXX -- this code will need to change somewhat if/when the shadow code
+    // can directly map superpages...
+    ft = ((regs->error_code & X86_PFEC_WRITE_FAULT) 
+          ? ft_demand_write : ft_demand_read);
+    ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
+    ASSERT(ptr_sl1e);
+
+    /* Calculate the shadow entry */
+    if ( ft == ft_demand_write )
+    {
+        if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) )
+        {
+            perfc_incrc(shadow2_fault_emulate_write);
+            goto emulate;
+        }
+    }
+    else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) )
+    {
+        perfc_incrc(shadow2_fault_emulate_read);
+        goto emulate;
+    }
+
+    /* Quick sanity check: we never make an MMIO entry that's got the 
+     * _PAGE_PRESENT flag set in it. */
+    ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT));
+
+    r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
+
+    if ( mmio ) 
+    {
+        gpa = guest_walk_to_gpa(&gw);
+        goto mmio;
+    }
+
+#if 0
+    if ( !(r & SHADOW2_SET_CHANGED) )
+        debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH2_PRI_pte
+                          ") did not change anything\n",
+                          __func__, gw.va, l1e_get_intpte(sl1e));
+#endif
+
+    perfc_incrc(shadow2_fault_fixed);
+    d->arch.shadow_fault_count++;
+    reset_early_unshadow(v);
+
+ done:
+    sh2_audit_gw(v, &gw);
+    unmap_walk(v, &gw);
+    SHADOW2_PRINTK("fixed\n");
+    shadow2_audit_tables(v);
+    shadow2_unlock(d);
+    return EXCRET_fault_fixed;
+
+ emulate:
+
+    /* Take the register set we were called with */
+    emul_regs = *regs;
+    if ( hvm_guest(v) )
+    {
+        /* Add the guest's segment selectors, rip, rsp. rflags */ 
+        hvm_store_cpu_guest_regs(v, &emul_regs, NULL);
+    }
+    emul_ctxt.regs = &emul_regs;
+    emul_ctxt.cr2 = va;
+    emul_ctxt.mode = hvm_guest(v) ? hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST;
+
+    SHADOW2_PRINTK("emulate: eip=%#lx\n", emul_regs.eip);
+
+    v->arch.shadow2_propagate_fault = 0;
+    if ( x86_emulate_memop(&emul_ctxt, &shadow2_emulator_ops) )
+    {
+        SHADOW2_PRINTK("emulator failure, unshadowing mfn %#lx\n", 
+                       mfn_x(gmfn));
+        perfc_incrc(shadow2_fault_emulate_failed);
+        /* If this is actually a page table, then we have a bug, and need 
+         * to support more operations in the emulator.  More likely, 
+         * though, this is a hint that this page should not be shadowed. */
+        shadow2_remove_all_shadows(v, gmfn);
+        /* This means that actual missing operations will cause the 
+         * guest to loop on the same page fault. */
+        goto done;
+    }
+    if ( v->arch.shadow2_propagate_fault )
+    {
+        /* Emulation triggered another page fault */
+        goto not_a_shadow_fault;
+    }
+
+    /* Emulator has changed the user registers: write back */
+    if ( hvm_guest(v) )
+    {
+        /* Write back the guest's segment selectors, rip, rsp. rflags */ 
+        hvm_load_cpu_guest_regs(v, &emul_regs);
+        /* And don't overwrite those in the caller's regs. */
+        emul_regs.eip = regs->eip;
+        emul_regs.cs = regs->cs;
+        emul_regs.eflags = regs->eflags;
+        emul_regs.esp = regs->esp;
+        emul_regs.ss = regs->ss;
+        emul_regs.es = regs->es;
+        emul_regs.ds = regs->ds;
+        emul_regs.fs = regs->fs;
+        emul_regs.gs = regs->gs;
+    }
+    *regs = emul_regs;
+
+    goto done;
+
+ mmio:
+    perfc_incrc(shadow2_fault_mmio);
+    if ( !hvm_apic_support(d) && (gpa >= 0xFEC00000) )
+    {
+        /* Need to deal with these disabled-APIC accesses, as
+         * handle_mmio() apparently does not currently do that. */
+        /* TJD: What about it, then?   For now, I'm turning this BUG() 
+         * into a domain_crash() since we don't want to kill Xen. */
+        SHADOW2_ERROR("disabled-APIC access: not supported\n.");
+        domain_crash(d); 
+    }
+    sh2_audit_gw(v, &gw);
+    unmap_walk(v, &gw);
+    SHADOW2_PRINTK("mmio\n");
+    shadow2_audit_tables(v);
+    reset_early_unshadow(v);
+    shadow2_unlock(d);
+    sh2_log_mmio(v, gpa);
+    handle_mmio(va, gpa);
+    return EXCRET_fault_fixed;
+
+ not_a_shadow_fault:
+    sh2_audit_gw(v, &gw);
+    unmap_walk(v, &gw);
+    SHADOW2_PRINTK("not a shadow fault\n");
+    shadow2_audit_tables(v);
+    reset_early_unshadow(v);
+    shadow2_unlock(d);
+    return 0;
+}
+
+
+static int
+sh2_invlpg(struct vcpu *v, unsigned long va)
+/* Called when the guest requests an invlpg.  Returns 1 if the invlpg
+ * instruction should be issued on the hardware, or 0 if it's safe not
+ * to do so. */
+{
+    shadow_l2e_t *sl2e = shadow_get_l2e(v, va);
+
+    // XXX -- might be a good thing to prefetch the va into the shadow
+
+    /* If there is no shadow L2 covering this address, nothing can be
+     * mapped there, so no flush is needed. */
+    if ( sl2e == NULL )
+        return 0;
+
+    /* Likewise, if the relevant sl2e has nothing shadowed behind it,
+     * there is nothing for an invlpg to invalidate. */
+    if ( !(shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT) )
+        return 0;
+
+    /* If the sl2e points at a splintered-superpage (fl1) shadow, flush
+     * the entire TLB instead: that's easier than invalidating each of
+     * the individual 4K pages the superpage was split into. */
+    if ( (mfn_to_page(shadow_l2e_get_mfn(*sl2e))->count_info
+          & PGC_SH2_type_mask) == PGC_SH2_fl1_shadow )
+    {
+        local_flush_tlb();
+        return 0;
+    }
+
+    /* An ordinary l1 shadow is mapped here: let the hardware invlpg run. */
+    return 1;
+}
+
+static unsigned long
+sh2_gva_to_gfn(struct vcpu *v, unsigned long va)
+/* Walk the *guest* pagetables and return the guest frame number they
+ * map this virtual address to. */
+{
+    unsigned long result;
+    walk_t gw;
+
+    /* Do a full guest walk, pull out the gfn, then drop the mappings
+     * that the walk created. */
+    guest_walk_tables(v, va, &gw, 0);
+    result = gfn_x(guest_walk_to_gfn(&gw));
+    unmap_walk(v, &gw);
+
+    return result;
+}
+
+
+static unsigned long
+sh2_gva_to_gpa(struct vcpu *v, unsigned long va)
+/* Translate a guest virtual address to the guest *physical* address
+ * that the guest's own pagetables map it to (0 if not mapped). */
+{
+    unsigned long gfn = sh2_gva_to_gfn(v, va);
+
+    if ( gfn == INVALID_GFN )
+        return 0;
+
+    /* Recombine the frame number with the offset within the page. */
+    return (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK);
+}
+
+
+// XXX -- should this be in this file?
+//        Or should it be moved to shadow2-common.c?
+//
+/* returns a lowmem machine address of the copied HVM L3 root table
+ * If clear_res != 0, then clear the PAE-l3 reserved bits in the copy,
+ * otherwise blank out any entries with reserved bits in them.  */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long
+hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res)
+/* Copy the four PAE l3 entries in l3tab into the vcpu's low-memory l3
+ * table and return that copy's machine address.
+ * If clear_res != 0, strip the PAE-l3 reserved bits from each copied
+ * entry; otherwise blank out any entry that has reserved bits set. */
+{
+    int i, flags;
+    int res = (_PAGE_RW|_PAGE_NX_BIT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY);
+    l3_pgentry_t new_l3e, *copy = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+
+    memcpy(copy, l3tab, 4 * sizeof(l3_pgentry_t));
+    for ( i = 0; i < 4; i++ )
+    {
+        flags = l3e_get_flags(l3tab[i]);
+        /* Keep an entry only if it is present and either free of
+         * reserved bits or we've been asked to clear them. */
+        if ( (flags & _PAGE_PRESENT) && (clear_res || !(flags & res)) )
+            new_l3e = l3e_from_pfn(l3e_get_pfn(l3tab[i]), flags & ~res);
+        else
+            new_l3e = l3e_empty();
+        safe_write_entry(&copy[i], &new_l3e);
+    }
+    return __pa(copy);
+}
+#endif
+
+
+static inline void
+sh2_update_linear_entries(struct vcpu *v)
+/* Sync up all the linear mappings for this vcpu's pagetables */
+{
+    struct domain *d = v->domain;
+
+    /* Linear pagetables in PV guests
+     * ------------------------------
+     *
+     * Guest linear pagetables, which map the guest pages, are at
+     * LINEAR_PT_VIRT_START.  Shadow linear pagetables, which map the
+     * shadows, are at SH_LINEAR_PT_VIRT_START.  Most of the time these
+     * are set up at shadow creation time, but (of course!) the PAE case
+     * is subtler.  Normal linear mappings are made by having an entry
+     * in the top-level table that points to itself (shadow linear) or
+     * to the guest top-level table (guest linear).  For PAE, to set up
+     * a linear map requires us to copy the four top-level entries into 
+     * level-2 entries.  That means that every time we change a PAE l3e,
+     * we need to reflect the change into the copy.
+     *
+     * Linear pagetables in HVM guests
+     * -------------------------------
+     *
+     * For HVM guests, the linear pagetables are installed in the monitor
+     * tables (since we can't put them in the shadow).  Shadow linear
+     * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START,
+     * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for 
+     * a linear pagetable of the monitor tables themselves.  We have 
+     * the same issue of having to re-copy PAE l3 entries whenever we use
+     * PAE shadows. 
+     *
+     * Because HVM guests run on the same monitor tables regardless of the 
+     * shadow tables in use, the linear mapping of the shadow tables has to 
+     * be updated every time v->arch.shadow_table changes. 
+     */
+
+    /* Don't try to update the monitor table if it doesn't exist */
+    if ( shadow2_mode_external(d) 
+         && pagetable_get_pfn(v->arch.monitor_table) == 0 ) 
+        return;
+
+#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4)
+    
+    /* For PV, one l4e points at the guest l4, one points at the shadow
+     * l4.  No maintenance required. 
+     * For HVM, just need to update the l4e that points to the shadow l4. */
+
+    if ( shadow2_mode_external(d) )
+    {
+        /* Use the linear map if we can; otherwise make a new mapping */
+        if ( v == current ) 
+        {
+            __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] = 
+                l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+                             __PAGE_HYPERVISOR);
+        } 
+        else
+        { 
+            /* Not the running vcpu: map its monitor l4 explicitly. */
+            l4_pgentry_t *ml4e;
+            ml4e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+            ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] = 
+                l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+                             __PAGE_HYPERVISOR);
+            sh2_unmap_domain_page(ml4e);
+        }
+    }
+
+#elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3)
+
+    /* This case only exists in HVM.  To give ourselves a linear map of the 
+     * shadows, we need to extend a PAE shadow to 4 levels.  We do this by 
+     * having a monitor l3 in slot 0 of the monitor l4 table, and 
+     * copying the PAE l3 entries into it.  Then, by having the monitor l4e
+     * for shadow pagetables also point to the monitor l4, we can use it
+     * to access the shadows. */
+
+    if ( shadow2_mode_external(d) )
+    {
+        /* Install copies of the shadow l3es into the monitor l3 table.
+         * The monitor l3 table is hooked into slot 0 of the monitor
+         * l4 table, so we use l3 linear indices 0 to 3 */
+        shadow_l3e_t *sl3e;
+        l3_pgentry_t *ml3e;
+        mfn_t l3mfn;
+        int i;
+
+        /* Use linear mappings if we can; otherwise make new mappings */
+        if ( v == current ) 
+        {
+            ml3e = __linear_l3_table;
+            l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0]));
+#if GUEST_PAGING_LEVELS == 2
+            /* Shadow l3 tables are made up by update_cr3 */
+            sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+#else
+            sl3e = v->arch.shadow_vtable;
+#endif
+        }
+        else 
+        {   
+            /* Walk from the monitor l4 to find and map the monitor l3. */
+            l4_pgentry_t *ml4e;
+            ml4e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+            ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT);
+            l3mfn = _mfn(l4e_get_pfn(ml4e[0]));
+            ml3e = sh2_map_domain_page(l3mfn);
+            sh2_unmap_domain_page(ml4e);
+#if GUEST_PAGING_LEVELS == 2
+            /* Shadow l3 tables are made up by update_cr3 */
+            sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+#else
+            sl3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.shadow_table));
+#endif
+        }
+
+        /* Reflect each present shadow l3e into the monitor l3;
+         * non-present entries become empty monitor l3es. */
+        for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+        {
+            ml3e[i] = 
+                (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT) 
+                ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])), 
+                               __PAGE_HYPERVISOR) 
+                : l3e_empty();
+        }
+
+        if ( v != current ) 
+        {
+            sh2_unmap_domain_page(ml3e);
+#if GUEST_PAGING_LEVELS != 2
+            sh2_unmap_domain_page(sl3e);
+#endif
+        }
+    }
+
+#elif CONFIG_PAGING_LEVELS == 3
+
+    /* PV: need to copy the guest's l3 entries into the guest-linear-map l2
+     * entries in the shadow, and the shadow's l3 entries into the 
+     * shadow-linear-map l2 entries in the shadow.  This is safe to do 
+     * because Xen does not let guests share high-slot l2 tables between l3s,
+     * so we know we're not treading on anyone's toes. 
+     *
+     * HVM: need to copy the shadow's l3 entries into the
+     * shadow-linear-map l2 entries in the monitor table.  This is safe
+     * because we have one monitor table for each vcpu.  The monitor's
+     * own l3es don't need to be copied because they never change.  
+     * XXX That might change if we start stuffing things into the rest
+     * of the monitor's virtual address space. 
+     */ 
+    {
+        l2_pgentry_t *l2e, new_l2e;
+        shadow_l3e_t *guest_l3e = NULL, *shadow_l3e;
+        int i;
+
+#if GUEST_PAGING_LEVELS == 2
+        /* Shadow l3 tables were built by update_cr3 */
+        if ( shadow2_mode_external(d) )
+            shadow_l3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab;
+        else
+            BUG(); /* PV 2-on-3 is not supported yet */
+        
+#else /* GUEST_PAGING_LEVELS == 3 */
+        
+        /* Use local vcpu's mappings if we can; otherwise make new mappings */
+        if ( v == current ) 
+        {
+            shadow_l3e = v->arch.shadow_vtable;
+            if ( !shadow2_mode_external(d) )
+                guest_l3e = v->arch.guest_vtable;
+        }
+        else 
+        {
+            mfn_t smfn;
+            int idx;
+            
+            /* Map the shadow l3 */
+            smfn = pagetable_get_mfn(v->arch.shadow_table);
+            idx = shadow_l3_index(&smfn, guest_index(v->arch.shadow_vtable));
+            shadow_l3e = sh2_map_domain_page(smfn);
+            shadow_l3e += idx;
+            if ( !shadow2_mode_external(d) )
+            {
+                /* Also the guest l3 */
+                mfn_t gmfn = pagetable_get_mfn(v->arch.guest_table); 
+                guest_l3e = sh2_map_domain_page(gmfn);
+                guest_l3e += guest_index(v->arch.guest_vtable);
+            }
+        }
+#endif /* GUEST_PAGING_LEVELS */
+        
+        /* Choose where to write the entries, using linear maps if possible */
+        if ( v == current && shadow2_mode_external(d) ) 
+        {
+            /* From the monitor tables, it's safe to use linear maps to update
+             * monitor l2s */
+            /* (l3 slot 3's l2 is the one covering the linear-map area) */
+            l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES);
+        }
+        else if ( shadow2_mode_external(d) ) 
+        {
+            /* Map the monitor table's high l2 */
+            l3_pgentry_t *l3e;
+            l3e = sh2_map_domain_page(
+                pagetable_get_mfn(v->arch.monitor_table));
+            ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT);
+            l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[3])));
+            sh2_unmap_domain_page(l3e);
+        } 
+        else 
+        {
+            /* Map the shadow table's high l2 */
+            ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT);
+            l2e = sh2_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3]));
+        }
+        
+        
+        if ( !shadow2_mode_external(d) )
+        {
+            /* Write linear mapping of guest. */
+            for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+            { 
+                new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT) 
+                    ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])),
+                                   __PAGE_HYPERVISOR) 
+                    : l2e_empty();
+                safe_write_entry(
+                    &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i],
+                    &new_l2e);
+            }
+        }
+        
+        /* Write linear mapping of shadow. */
+        for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ )
+        {
+            new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT) 
+                ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])),
+                               __PAGE_HYPERVISOR) 
+                : l2e_empty();
+            safe_write_entry(
+                &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i],
+                &new_l2e);
+        }
+        
+        /* Undo whichever temporary mappings were made above. */
+        if ( v != current || !shadow2_mode_external(d) )
+            sh2_unmap_domain_page(l2e);
+        
+#if GUEST_PAGING_LEVELS == 3
+        if ( v != current) 
+        {
+            sh2_unmap_domain_page(shadow_l3e);
+            if ( !shadow2_mode_external(d) )
+                sh2_unmap_domain_page(guest_l3e);
+        }
+#endif
+    }
+
+#elif CONFIG_PAGING_LEVELS == 2
+
+    /* For PV, one l2e points at the guest l2, one points at the shadow
+     * l2. No maintenance required. 
+     * For HVM, just need to update the l2e that points to the shadow l2. */
+
+    if ( shadow2_mode_external(d) )
+    {
+        /* Use the linear map if we can; otherwise make a new mapping */
+        if ( v == current ) 
+        {
+            __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] = 
+                l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+                             __PAGE_HYPERVISOR);
+        } 
+        else
+        { 
+            /* Not the running vcpu: map its monitor l2 explicitly. */
+            l2_pgentry_t *ml2e;
+            ml2e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+            ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = 
+                l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table),
+                             __PAGE_HYPERVISOR);
+            sh2_unmap_domain_page(ml2e);
+        }
+    }
+
+#else
+#error this should not happen
+#endif
+}
+
+
+// XXX -- should this be in this file?
+//        Or should it be moved to shadow2-common.c?
+//
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+void sh2_pae_recopy(struct domain *d)
+/* Called whenever we write to the l3 entries of a PAE pagetable which 
+ * is currently in use.  Each vcpu that is using the table needs to 
+ * resync its copies of the l3s in linear maps and any low-memory
+ * copies it might have made for fitting into 32bit CR3.
+ * Since linear maps are also resynced when we change CR3, we don't
+ * need to worry about changes to PAE l3es that are not currently in use.*/
+{
+    struct vcpu *v;
+    cpumask_t flush_mask = CPU_MASK_NONE;
+    ASSERT(shadow2_lock_is_acquired(d));
+    
+    for_each_vcpu(d, v)
+    {
+        /* Only vcpus flagged by the writer need their copies refreshed. */
+        if ( !v->arch.shadow2_pae_flip_pending ) 
+            continue;
+
+        /* Remember to flush this vcpu's TLB once all copies are updated. */
+        cpu_set(v->processor, flush_mask);
+        
+        SHADOW2_PRINTK("d=%u v=%u\n", v->domain->domain_id, v->vcpu_id);
+
+        /* This vcpu has a copy in its linear maps */
+        sh2_update_linear_entries(v);
+        if ( hvm_guest(v) )
+        {
+            /* This vcpu has a copy in its HVM PAE l3 */
+            v->arch.hvm_vcpu.hw_cr3 = 
+                hvm_pae_copy_root(v, v->arch.shadow_vtable,
+                                  !shadow2_vcpu_mode_translate(v));
+        }
+#if CONFIG_PAGING_LEVELS == 3
+        else 
+        {
+            /* This vcpu might have copied the l3 to below 4GB */
+            if ( v->arch.cr3 >> PAGE_SHIFT 
+                 != pagetable_get_pfn(v->arch.shadow_table) )
+            {
+                /* Recopy to where that copy is. */
+                int i;
+                l3_pgentry_t *dst, *src;
+                dst = __va(v->arch.cr3 & ~0x1f); /* Mask cache control bits */
+                src = v->arch.shadow_vtable;
+                for ( i = 0 ; i < 4 ; i++ ) 
+                    safe_write_entry(dst + i, src + i);
+            }
+        }
+#endif
+        v->arch.shadow2_pae_flip_pending = 0;        
+    }
+
+    /* Make every cpu that was running an affected vcpu reload its TLB. */
+    flush_tlb_mask(flush_mask);
+}
+#endif /* (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) */
+
+
+/* removes:
+ *     vcpu->arch.guest_vtable
+ *     vcpu->arch.shadow_table
+ *     vcpu->arch.shadow_vtable
+ * Does all appropriate management/bookkeeping/refcounting/etc...
+ * (Called from sh2_update_cr3 before installing the new tables.)
+ */
+static void
+sh2_detach_old_tables(struct vcpu *v)
+{
+    mfn_t smfn;
+
+    ////
+    //// vcpu->arch.guest_vtable
+    ////
+    /* Only external-mode and PAE guests hold a real mapping here; other
+     * configurations point guest_vtable at the linear map (see update_cr3). */
+    if ( (shadow2_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
+         v->arch.guest_vtable )
+    {
+        // Q: why does this need to use (un)map_domain_page_*global* ?
+        sh2_unmap_domain_page_global(v->arch.guest_vtable);
+        v->arch.guest_vtable = NULL;
+    }
+
+    ////
+    //// vcpu->arch.shadow_table
+    ////
+    /* Release the reference that v->arch.shadow_table itself held. */
+    smfn = pagetable_get_mfn(v->arch.shadow_table);
+    if ( mfn_x(smfn) )
+    {
+        ASSERT(v->arch.shadow_vtable);
+
+#if GUEST_PAGING_LEVELS == 3
+        // PAE guests do not (necessarily) use an entire page for their
+        // 4-entry L3s, so we have to deal with them specially.
+        //
+        sh2_put_ref_l3_subshadow(v, v->arch.shadow_vtable, smfn);
+#else
+        sh2_put_ref(v, smfn, 0);
+#endif
+
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+        {
+            /* Stop tracking this vcpu as a user of the l3 subshadow. */
+            struct pae_l3_bookkeeping *info =
+                sl3p_to_info(v->arch.shadow_vtable);
+            ASSERT(test_bit(v->vcpu_id, &info->vcpus));
+            clear_bit(v->vcpu_id, &info->vcpus);
+        }
+#endif
+        v->arch.shadow_table = pagetable_null();
+    }
+
+    ////
+    //// vcpu->arch.shadow_vtable
+    ////
+    if ( (shadow2_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) &&
+         v->arch.shadow_vtable )
+    {
+        // Q: why does this need to use (un)map_domain_page_*global* ?
+        //
+        sh2_unmap_domain_page_global(v->arch.shadow_vtable);
+        v->arch.shadow_vtable = NULL;
+    }
+}
+
+static void
+sh2_update_cr3(struct vcpu *v)
+/* Updates vcpu->arch.shadow_table after the guest has changed CR3.
+ * Paravirtual guests should set v->arch.guest_table (and guest_table_user,
+ * if appropriate).
+ * HVM guests should also set hvm_get_guest_cntl_reg(v, 3)...
+ */
+{
+    struct domain *d = v->domain;
+    mfn_t gmfn, smfn;
+#if GUEST_PAGING_LEVELS == 3
+    u32 guest_idx=0;
+#endif
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+    ASSERT(v->arch.shadow2);
+
+    ////
+    //// vcpu->arch.guest_table is already set
+    ////
+    
+#ifndef NDEBUG 
+    /* Double-check that the HVM code has sent us a sane guest_table */
+    if ( hvm_guest(v) )
+    {
+        gfn_t gfn;
+
+        ASSERT(shadow2_mode_external(d));
+
+        // Is paging enabled on this vcpu?
+        if ( shadow2_vcpu_mode_translate(v) )
+        {
+            /* guest_table should be the frame the guest's CR3 names. */
+            gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3)));
+            gmfn = vcpu_gfn_to_mfn(v, gfn);
+            ASSERT(valid_mfn(gmfn));
+            ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn));
+        } 
+        else 
+        {
+            /* Paging disabled: guest_table points at (part of) p2m */
+#if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */
+            /* For everything else, they sould be the same */
+            ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn);
+#endif
+        }
+    }
+#endif
+
+    SHADOW2_PRINTK("d=%u v=%u guest_table=%05lx\n",
+                   d->domain_id, v->vcpu_id, 
+                   (unsigned long)pagetable_get_pfn(v->arch.guest_table));
+
+#if GUEST_PAGING_LEVELS == 4
+    /* 64-bit PV guests have separate kernel/user top-level tables. */
+    if ( !(v->arch.flags & TF_kernel_mode) )
+        gmfn = pagetable_get_mfn(v->arch.guest_table_user);
+    else
+#endif
+        gmfn = pagetable_get_mfn(v->arch.guest_table);
+
+    /* Drop refs/mappings of the old tables before building new ones. */
+    sh2_detach_old_tables(v);
+
+    /* Nothing more to do if the vcpu hasn't been fully created yet. */
+    if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+    {
+        ASSERT(v->arch.cr3 == 0);
+        return;
+    }
+
+    ////
+    //// vcpu->arch.guest_vtable
+    ////
+    if ( shadow2_mode_external(d) )
+    {
+#if GUEST_PAGING_LEVELS == 3
+        if ( shadow2_vcpu_mode_translate(v) ) 
+            /* Paging enabled: find where in the page the l3 table is */
+            guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3));
+        else
+            /* Paging disabled: l3 is at the start of a page (in the p2m) */ 
+            guest_idx = 0; 
+
+        // Ignore the low 2 bits of guest_idx -- they are really just
+        // cache control.
+        guest_idx &= ~3;
+        // XXX - why does this need a global map?
+        v->arch.guest_vtable =
+            (guest_l3e_t *)sh2_map_domain_page_global(gmfn) + guest_idx;
+#else
+        // XXX - why does this need a global map?
+        v->arch.guest_vtable = sh2_map_domain_page_global(gmfn);
+#endif
+    }
+    else
+    {
+        /* Non-external guests can use the linear map of their own tables. */
+#ifdef __x86_64__
+        v->arch.guest_vtable = __linear_l4_table;
+#elif GUEST_PAGING_LEVELS == 3
+        // XXX - why does this need a global map?
+        v->arch.guest_vtable = sh2_map_domain_page_global(gmfn);
+#else
+        v->arch.guest_vtable = __linear_l2_table;
+#endif
+    }
+
+#if 0
+    printk("%s %s %d gmfn=%05lx guest_vtable=%p\n",
+           __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable);
+#endif
+
+    ////
+    //// vcpu->arch.shadow_table
+    ////
+    /* Does this guest root already have a top-level shadow? */
+    smfn = get_shadow_status(v, gmfn, PGC_SH2_guest_root_type);
+    if ( valid_mfn(smfn) )
+    {
+        /* Pull this root shadow to the front of the list of roots. */
+        list_del(&mfn_to_page(smfn)->list);
+        list_add(&mfn_to_page(smfn)->list, &d->arch.shadow2_toplevel_shadows);
+    }
+    else
+    {
+        /* This guest MFN is a pagetable.  Must revoke write access. */
+        if ( shadow2_remove_write_access(v, gmfn, GUEST_PAGING_LEVELS, 0) 
+             != 0 )
+            flush_tlb_mask(d->domain_dirty_cpumask); 
+        /* Make sure there's enough free shadow memory. */
+        shadow2_prealloc(d, SHADOW2_MAX_ORDER); 
+        /* Shadow the page. */
+        smfn = sh2_make_shadow(v, gmfn, PGC_SH2_guest_root_type);
+        list_add(&mfn_to_page(smfn)->list, &d->arch.shadow2_toplevel_shadows);
+    }
+    ASSERT(valid_mfn(smfn));
+    v->arch.shadow_table = pagetable_from_mfn(smfn);
+
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_EARLY_UNSHADOW
+    /* Once again OK to unhook entries from this table if we see fork/exit */
+    ASSERT(sh2_mfn_is_a_page_table(gmfn));
+    mfn_to_page(gmfn)->shadow2_flags &= ~SH2F_unhooked_mappings;
+#endif
+
+
+    ////
+    //// vcpu->arch.shadow_vtable
+    ////
+    if ( shadow2_mode_external(d) )
+    {
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+        /* PAE l3s may live part-way into a page: index to the subshadow. */
+        mfn_t adjusted_smfn = smfn;
+        u32 shadow_idx = shadow_l3_index(&adjusted_smfn, guest_idx);
+        // Q: why does this need to use (un)map_domain_page_*global* ?
+        v->arch.shadow_vtable =
+            (shadow_l3e_t *)sh2_map_domain_page_global(adjusted_smfn) +
+            shadow_idx;
+#else
+        // Q: why does this need to use (un)map_domain_page_*global* ?
+        v->arch.shadow_vtable = sh2_map_domain_page_global(smfn);
+#endif
+    }
+    else
+    {
+#if SHADOW_PAGING_LEVELS == 4
+        v->arch.shadow_vtable = __sh2_linear_l4_table;
+#elif GUEST_PAGING_LEVELS == 3
+        // XXX - why does this need a global map?
+        v->arch.shadow_vtable = sh2_map_domain_page_global(smfn);
+#else
+        v->arch.shadow_vtable = __sh2_linear_l2_table;
+#endif
+    }
+
+    ////
+    //// Take a ref to the new shadow table, and pin it.
+    ////
+    //
+    // This ref is logically "held" by v->arch.shadow_table entry itself.
+    // Release the old ref.
+    //
+#if GUEST_PAGING_LEVELS == 3
+    // PAE guests do not (necessarily) use an entire page for their
+    // 4-entry L3s, so we have to deal with them specially.
+    //
+    // XXX - might want to revisit this if/when we do multiple compilation for
+    //       HVM-vs-PV guests, as PAE PV guests could get away without doing
+    //       subshadows.
+    //
+    sh2_get_ref_l3_subshadow(v->arch.shadow_vtable, smfn);
+    sh2_pin_l3_subshadow(v->arch.shadow_vtable, smfn);
+#else
+    sh2_get_ref(smfn, 0);
+    sh2_pin(smfn);
+#endif
+
+#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3)
+    // PAE 3-on-3 shadows have to keep track of which vcpu's are using
+    // which l3 subshadow, in order handle the SHADOW2_SET_L3PAE_RECOPY
+    // case from validate_gl3e().  Search for SHADOW2_SET_L3PAE_RECOPY
+    // in the code for more info.
+    //
+    {
+        struct pae_l3_bookkeeping *info =
+            sl3p_to_info(v->arch.shadow_vtable);
+        ASSERT(!test_bit(v->vcpu_id, &info->vcpus));
+        set_bit(v->vcpu_id, &info->vcpus);
+    }
+#endif
+
+    debugtrace_printk("%s cr3 gmfn=%05lx smfn=%05lx\n",
+                      __func__, gmfn, smfn);
+
+    ///
+    /// v->arch.cr3 and, if appropriate, v->arch.hvm_vcpu.hw_cr3
+    ///
+    if ( shadow2_mode_external(d) )
+    {
+        /* HVM vcpus always run on their monitor tables. */
+        ASSERT(hvm_guest(v));
+        make_cr3(v, pagetable_get_pfn(v->arch.monitor_table));
+
+#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2)
+#if SHADOW_PAGING_LEVELS != 3
+#error unexpected combination of GUEST and SHADOW paging levels
+#endif
+        /* 2-on-3: make a PAE l3 table that points at the four-page l2 */
+        {
+            mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table);
+            int i;
+
+            ASSERT(v->arch.hvm_vcpu.hw_cr3 ==
+                   virt_to_maddr(v->arch.hvm_vcpu.hvm_lowmem_l3tab));
+            for (i = 0; i < 4; i++)
+            {
+                v->arch.hvm_vcpu.hvm_lowmem_l3tab[i] =
+                    shadow_l3e_from_mfn(_mfn(mfn_x(smfn)+i), _PAGE_PRESENT);
+            }
+        }
+#elif (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+        /* 3-on-3: copy the shadow l3 to slots that are below 4GB.
+         * If paging is disabled, clear l3e reserved bits; otherwise 
+         * remove entries that have reserved bits set. */
+        v->arch.hvm_vcpu.hw_cr3 =
+            hvm_pae_copy_root(v, v->arch.shadow_vtable, 
+                              !shadow2_vcpu_mode_translate(v));
+#else
+        /* 2-on-2 or 4-on-4: just put the shadow top-level into cr3 */
+        v->arch.hvm_vcpu.hw_cr3 =
+            pagetable_get_paddr(v->arch.shadow_table);
+#endif
+    }
+    else // not shadow2_mode_external...
+    {
+        /* We don't support PV except guest == shadow == config levels */
+        BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS);
+        make_cr3(v, pagetable_get_pfn(v->arch.shadow_table));
+    }
+
+    /* Fix up the linear pagetable mappings */
+    sh2_update_linear_entries(v);
+}
+
+
+/**************************************************************************/
+/* Functions to revoke guest rights */
+
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
+static int sh2_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
+/* Look up this vaddr in the current shadow and see if it's a writeable
+ * mapping of this gmfn.  If so, remove it.  Returns 1 if it worked. */
+{
+    shadow_l1e_t sl1e, *sl1p;
+    shadow_l2e_t *sl2p;
+#if GUEST_PAGING_LEVELS >= 3
+    shadow_l3e_t *sl3p;
+#if GUEST_PAGING_LEVELS >= 4
+    shadow_l4e_t *sl4p;
+#endif
+#endif
+    mfn_t sl1mfn;
+
+
+    /* Carefully look in the shadow linear map for the l1e we expect */
+    /* (vaddr is only a heuristic guess: bail out, returning 0, at any
+     * paging level that turns out not to be present) */
+    if ( v->arch.shadow_vtable == NULL ) return 0;
+#if GUEST_PAGING_LEVELS >= 4
+    sl4p = sh2_linear_l4_table(v) + shadow_l4_linear_offset(vaddr);
+    if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) )
+        return 0;
+    sl3p = sh2_linear_l3_table(v) + shadow_l3_linear_offset(vaddr);
+    if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
+        return 0;
+#elif GUEST_PAGING_LEVELS == 3
+    /* PAE: the four l3es live in shadow_vtable, not in the linear map. */
+    sl3p = ((shadow_l3e_t *) v->arch.shadow_vtable) 
+        + shadow_l3_linear_offset(vaddr);
+    if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) )
+        return 0;
+#endif
+    sl2p = sh2_linear_l2_table(v) + shadow_l2_linear_offset(vaddr);
+    if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) )
+        return 0;
+    sl1p = sh2_linear_l1_table(v) + shadow_l1_linear_offset(vaddr);
+    sl1e = *sl1p;
+    /* The guess only "worked" if this l1e is a present, writeable
+     * mapping of exactly the gmfn we're trying to protect. */
+    if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
+          != (_PAGE_PRESENT|_PAGE_RW))
+         || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
+        return 0;
+
+    /* Found it!  Need to remove its write permissions. */
+    /* (the l1 shadow's own mfn comes from the l2e we walked through) */
+    sl1mfn = shadow_l2e_get_mfn(*sl2p);
+    sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
+    shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
+    return 1;
+}
+#endif
+
+int sh2_remove_write_access(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn)
+/* Excises all writeable mappings to readonly_mfn from this l1 shadow table */
+{
+    shadow_l1e_t *sl1e;
+    int done = 0, f;
+
+    SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done,
+    {
+        f = shadow_l1e_get_flags(*sl1e);
+        /* Only present, writeable entries pointing at the frame count. */
+        if ( ((f & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW))
+             && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) )
+        {
+            shadow_set_l1e(v, sl1e, shadow_l1e_empty(), sl1mfn);
+            /* When the frame's writeable refcount reaches zero there is
+             * nothing left to excise; setting done breaks us cleanly out
+             * of the FOREACH macro. */
+            if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info
+                  & PGT_count_mask) == 0 )
+                done = 1;
+        }
+    });
+    return done;
+}
+
+
+int sh2_remove_all_mappings(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn)
+/* Excises all mappings to guest frame from this shadow l1 table */
+{
+    shadow_l1e_t *entry;
+    int done = 0, f;
+
+    SHADOW2_FOREACH_L1E(sl1mfn, entry, 0, done,
+    {
+        f = shadow_l1e_get_flags(*entry);
+        /* Zap any present sl1e that maps the target frame. */
+        if ( (f & _PAGE_PRESENT)
+             && (mfn_x(shadow_l1e_get_mfn(*entry)) == mfn_x(target_mfn)) )
+        {
+            shadow_set_l1e(v, entry, shadow_l1e_empty(), sl1mfn);
+            /* No references left to the frame?  Then we can stop early;
+             * setting done breaks us cleanly out of the FOREACH macro. */
+            if ( (mfn_to_page(target_mfn)->count_info & PGC_count_mask) == 0 )
+                done = 1;
+        }
+    });
+    return done;
+}
+
+/**************************************************************************/
+/* Functions to excise all pointers to shadows from higher-level shadows. */
+
+void sh2_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn)
+/* Blank out a single shadow entry */
+{
+    /* Dispatch on the shadow type of the page containing the entry, so
+     * the correctly-sized shadow_set_lNe() performs the update (and the
+     * associated refcounting / flush bookkeeping). */
+    switch (mfn_to_page(smfn)->count_info & PGC_SH2_type_mask) 
+    {
+    case PGC_SH2_l1_shadow:
+        shadow_set_l1e(v, ep, shadow_l1e_empty(), smfn); break;
+    case PGC_SH2_l2_shadow:
+#if GUEST_PAGING_LEVELS == 3
+    case PGC_SH2_l2h_shadow:
+#endif
+        shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break;
+#if GUEST_PAGING_LEVELS >= 3
+    case PGC_SH2_l3_shadow:
+        shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break;
+#if GUEST_PAGING_LEVELS >= 4
+    case PGC_SH2_l4_shadow:
+        shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break;
+#endif
+#endif
+    default: BUG(); /* Called with the wrong kind of shadow. */
+    }
+}
+
+int sh2_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn)
+/* Remove all mappings of this l1 shadow from this l2 shadow */
+{
+    shadow_l2e_t *sl2e;
+    int done = 0;
+    int flags;
+#if GUEST_PAGING_LEVELS != 4
+    /* Below 4 guest levels, non-external (PV) l2 shadows also carry Xen
+     * mappings, which the FOREACH macro must skip over. */
+    int xen_mappings = !shadow2_mode_external(v->domain);
+#endif
+    
+    SHADOW2_FOREACH_L2E(sl2mfn, sl2e, 0, done, xen_mappings, 
+    {
+        flags = shadow_l2e_get_flags(*sl2e);
+        if ( (flags & _PAGE_PRESENT) 
+             && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) )
+        {
+            shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn);
+            /* Stop early once sl1mfn no longer has any shadow type bits
+             * set (i.e. the shadow has been destroyed). */
+            if ( (mfn_to_page(sl1mfn)->count_info & PGC_SH2_type_mask) == 0 )
+                /* This breaks us cleanly out of the FOREACH macro */
+                done = 1;
+        }
+    });
+    return done;
+}
+
+#if GUEST_PAGING_LEVELS >= 3
+int sh2_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn)
+/* Remove all mappings of this l2 shadow from this l3 shadow */
+{
+    shadow_l3e_t *sl3e;
+    int done = 0;
+    int flags;
+    
+    /* Same pattern as sh2_remove_l1_shadow(), one level up. */
+    SHADOW2_FOREACH_L3E(sl3mfn, sl3e, 0, done, 
+    {
+        flags = shadow_l3e_get_flags(*sl3e);
+        if ( (flags & _PAGE_PRESENT) 
+             && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) )
+        {
+            shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn);
+            /* Stop early once sl2mfn's shadow type bits are all clear. */
+            if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH2_type_mask) == 0 )
+                /* This breaks us cleanly out of the FOREACH macro */
+                done = 1;
+        }
+    });
+    return done;
+}
+
+#if GUEST_PAGING_LEVELS >= 4
+int sh2_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn)
+/* Remove all mappings of this l3 shadow from this l4 shadow */
+{
+    shadow_l4e_t *sl4e;
+    int done = 0;
+    /* Non-external (PV) l4s carry Xen mappings; skip them in the walk. */
+    int flags, xen_mappings = !shadow2_mode_external(v->domain);
+    
+    SHADOW2_FOREACH_L4E(sl4mfn, sl4e, 0, done, xen_mappings,
+    {
+        flags = shadow_l4e_get_flags(*sl4e);
+        if ( (flags & _PAGE_PRESENT) 
+             && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) )
+        {
+            shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn);
+            /* Stop early once sl3mfn's shadow type bits are all clear. */
+            if ( (mfn_to_page(sl3mfn)->count_info & PGC_SH2_type_mask) == 0 )
+                /* This breaks us cleanly out of the FOREACH macro */
+                done = 1;
+        }
+    });
+    return done;
+}
+#endif /* 64bit guest */ 
+#endif /* PAE guest */
+
+/**************************************************************************/
+/* Handling HVM guest writes to pagetables  */
+
+/* Check that the user is allowed to perform this write. 
+ * Returns a mapped pointer to write to, and the mfn it's on,
+ * or NULL for error. */
+static inline void * emulate_map_dest(struct vcpu *v,
+                                      unsigned long vaddr,
+                                      struct x86_emulate_ctxt *ctxt,
+                                      mfn_t *mfnp)
+{
+    walk_t gw;
+    u32 flags;
+    gfn_t gfn;
+    mfn_t mfn;
+
+    /* Walk the guest pagetables (taking guest locks as needed) to find
+     * the frame backing vaddr and the access rights along the path. */
+    guest_walk_tables(v, vaddr, &gw, 1);
+    flags = accumulate_guest_flags(&gw);
+    gfn = guest_l1e_get_gfn(gw.eff_l1e);
+    mfn = vcpu_gfn_to_mfn(v, gfn);
+    sh2_audit_gw(v, &gw);
+    unmap_walk(v, &gw);
+
+    /* The guest must have a present, writeable mapping; user access is
+     * also required if the faulting code was running in ring 3. */
+    if ( !(flags & _PAGE_PRESENT) 
+         || !(flags & _PAGE_RW) 
+         || (!(flags & _PAGE_USER) && ring_3(ctxt->regs)) )
+    {
+        /* This write would have faulted even on bare metal */
+        v->arch.shadow2_propagate_fault = 1;
+        return NULL;
+    }
+    
+    if ( !valid_mfn(mfn) )
+    {
+        /* Attempted a write to a bad gfn.  This should never happen:
+         * after all, we're here because this write is to a page table. */
+        BUG();
+    }
+
+    ASSERT(sh2_mfn_is_a_page_table(mfn));
+    *mfnp = mfn;
+    /* Caller is responsible for sh2_unmap_domain_page() on the result. */
+    return sh2_map_domain_page(mfn) + (vaddr & ~PAGE_MASK);
+}
+
+int
+sh2_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
+                      u32 bytes, struct x86_emulate_ctxt *ctxt)
+/* Emulate a guest write to a pagetable, splitting it into page-sized
+ * chunks and validating each chunk against the shadows.
+ * Returns an X86EMUL_* status code. */
+{
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+    while ( bytes > 0 )
+    {
+        mfn_t mfn;
+        int bytes_on_page;
+        void *addr;
+
+        /* Clip this chunk to the end of the current page. */
+        bytes_on_page = PAGE_SIZE - (vaddr & ~PAGE_MASK);
+        if ( bytes_on_page > bytes )
+            bytes_on_page = bytes;
+
+        if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
+            return X86EMUL_PROPAGATE_FAULT;
+        memcpy(addr, src, bytes_on_page);
+        shadow2_validate_guest_pt_write(v, mfn, addr, bytes_on_page);
+        /* If we are writing zeros to this page, might want to unshadow */
+        if ( *(u8 *)addr == 0 )
+            check_for_early_unshadow(v, mfn);
+        sh2_unmap_domain_page(addr);
+        /* Advance to the next chunk.  Without stepping vaddr and src, a
+         * write crossing a page boundary would copy the same source
+         * bytes to the same destination page again instead of
+         * continuing the write on the following page. */
+        bytes -= bytes_on_page;
+        vaddr += bytes_on_page;
+        src = (u8 *)src + bytes_on_page;
+    }
+    shadow2_audit_tables(v);
+    return X86EMUL_CONTINUE;
+}
+
+int
+sh2_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr, 
+                        unsigned long old, unsigned long new,
+                        unsigned int bytes, struct x86_emulate_ctxt *ctxt)
+/* Emulate a guest cmpxchg to a pagetable.  On a successful exchange the
+ * new value is validated against the shadows.  Returns X86EMUL_CONTINUE,
+ * X86EMUL_CMPXCHG_FAILED or X86EMUL_PROPAGATE_FAULT. */
+{
+    mfn_t mfn;
+    void *addr;
+    unsigned long prev;
+    int rv = X86EMUL_CONTINUE;
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+    ASSERT(bytes <= sizeof (unsigned long));
+
+    if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
+        return X86EMUL_PROPAGATE_FAULT;
+
+    switch (bytes) 
+    {
+    case 1: prev = cmpxchg(((u8 *)addr), old, new);  break;
+    case 2: prev = cmpxchg(((u16 *)addr), old, new); break;
+    case 4: prev = cmpxchg(((u32 *)addr), old, new); break;
+    case 8: prev = cmpxchg(((u64 *)addr), old, new); break;
+    default:
+        SHADOW2_PRINTK("cmpxchg of size %i is not supported\n", bytes);
+        prev = ~old; /* Guarantees the comparison below fails */
+    }
+
+    if ( prev == old )
+        shadow2_validate_guest_pt_write(v, mfn, addr, bytes);
+    else
+        rv = X86EMUL_CMPXCHG_FAILED;
+
+    SHADOW2_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
+                  " wanted %#lx now %#lx bytes %u\n",
+                  vaddr, prev, old, new, *(unsigned long *)addr, bytes);
+
+    /* If we are writing zeros to this page, might want to unshadow.
+     * (A second, unconditional check_for_early_unshadow() call used to
+     * follow the audit below; it was redundant and defeated this
+     * zero-write heuristic, so it has been removed.) */
+    if ( *(u8 *)addr == 0 )
+        check_for_early_unshadow(v, mfn);
+
+    sh2_unmap_domain_page(addr);
+    shadow2_audit_tables(v);
+    return rv;
+}
+
+int
+sh2_x86_emulate_cmpxchg8b(struct vcpu *v, unsigned long vaddr, 
+                          unsigned long old_lo, unsigned long old_hi,
+                          unsigned long new_lo, unsigned long new_hi,
+                          struct x86_emulate_ctxt *ctxt)
+/* Emulate a guest CMPXCHG8B to a pagetable: a 64-bit compare-exchange
+ * supplied as lo/hi 32-bit halves.  On success the new value is
+ * validated against the shadows.  Returns an X86EMUL_* status code. */
+{
+    mfn_t mfn;
+    void *addr;
+    u64 old, new, prev;
+    int rv = X86EMUL_CONTINUE;
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+
+    if ( (addr = emulate_map_dest(v, vaddr, ctxt, &mfn)) == NULL )
+        return X86EMUL_PROPAGATE_FAULT;
+
+    /* Reassemble the 64-bit operands from their 32-bit halves. */
+    old = (((u64) old_hi) << 32) | (u64) old_lo;
+    new = (((u64) new_hi) << 32) | (u64) new_lo;
+    prev = cmpxchg(((u64 *)addr), old, new);
+
+    if ( prev == old )
+        shadow2_validate_guest_pt_write(v, mfn, addr, 8);
+    else
+        rv = X86EMUL_CMPXCHG_FAILED;
+
+    /* If we are writing zeros to this page, might want to unshadow.
+     * (A second, unconditional check_for_early_unshadow() call used to
+     * follow the audit below; it was redundant and defeated this
+     * zero-write heuristic, so it has been removed.) */
+    if ( *(u8 *)addr == 0 )
+        check_for_early_unshadow(v, mfn);
+
+    sh2_unmap_domain_page(addr);
+    shadow2_audit_tables(v);
+    return rv;
+}
+
+
+/**************************************************************************/
+/* Audit tools */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES
+
+#define AUDIT_FAIL(_level, _fmt, _a...) do {                               \
+    printk("Shadow2 %u-on-%u audit failed at level %i, index %i\n"         \
+           "gl" #_level "mfn = %" SH2_PRI_mfn                              \
+           " sl" #_level "mfn = %" SH2_PRI_mfn                             \
+           " &gl" #_level "e = %p &sl" #_level "e = %p"                    \
+           " gl" #_level "e = %" SH2_PRI_gpte                              \
+           " sl" #_level "e = %" SH2_PRI_pte "\nError: " _fmt "\n",        \
+           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                      \
+           _level, guest_index(gl ## _level ## e),                         \
+           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),         \
+           gl ## _level ## e, sl ## _level ## e,                           \
+           gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
+           ##_a);                                                          \
+    BUG();                                                                 \
+    done = 1;                                                              \
+} while (0)
+
+
+static char * sh2_audit_flags(struct vcpu *v, int level,
+                              int gflags, int sflags) 
+/* Common code for auditing flag bits.  Returns NULL if the shadow flags
+ * are consistent with the guest flags, or a string describing the first
+ * inconsistency found. */
+{
+    if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) )
+        return "shadow is present but guest is not present";
+    if ( (sflags & _PAGE_GLOBAL) && !hvm_guest(v) ) 
+        return "global bit set in PV shadow";
+    /* Dirty must be propagated for l1es and for l2e superpages. */
+    if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE)))
+         && ((sflags & _PAGE_DIRTY) && !(gflags & _PAGE_DIRTY)) ) 
+        return "dirty bit not propagated";
+    /* Shadows never use superpages: guest PSE entries get fl1 shadows. */
+    if ( level == 2 && (sflags & _PAGE_PSE) )
+        return "PS bit set in shadow";
+#if SHADOW_PAGING_LEVELS == 3
+    if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */
+#endif
+    if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) ) 
+        return "user/supervisor bit does not match";
+    if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) ) 
+        return "NX bit does not match";
+    if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) ) 
+        return "shadow grants write access but guest does not";
+    if ( (sflags & _PAGE_ACCESSED) && !(gflags & _PAGE_ACCESSED) ) 
+        return "accessed bit not propagated";
+    return NULL;
+}
+
+static inline mfn_t
+audit_gfn_to_mfn(struct vcpu *v, gfn_t gfn, mfn_t gmfn)
+/* Convert this gfn to an mfn in the manner appropriate for the
+ * guest pagetable it's used in (gmfn) */ 
+{
+    /* Without translation, gfns and mfns are the same space. */
+    if ( !shadow2_mode_translate(v->domain) )
+        return _mfn(gfn_x(gfn));
+    
+    if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_type_mask)
+         != PGT_writable_page ) 
+        return _mfn(gfn_x(gfn)); /* This is a paging-disabled shadow */
+    else 
+        return sh2_gfn_to_mfn(v->domain, gfn_x(gfn));
+} 
+
+
+int sh2_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
+/* Audit an l1 shadow against its guest l1: flag consistency for every
+ * entry and (optionally) gfn->mfn translation.  BUGs on failure. */
+{
+    guest_l1e_t *gl1e, *gp;
+    shadow_l1e_t *sl1e;
+    mfn_t mfn, gmfn, gl1mfn;
+    gfn_t gfn;
+    char *s;
+    int done = 0;
+
+    /* Follow the backpointer */
+    gl1mfn = _mfn(mfn_to_page(sl1mfn)->u.inuse.type_info);
+    gl1e = gp = sh2_map_domain_page(gl1mfn);
+    SHADOW2_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
+
+        s = sh2_audit_flags(v, 1, guest_l1e_get_flags(*gl1e),
+                            shadow_l1e_get_flags(*sl1e));
+        if ( s ) AUDIT_FAIL(1, "%s", s);
+
+        /* Optionally also check that the shadow's mfn is the correct
+         * translation of the guest's gfn. */
+        if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
+        {
+            gfn = guest_l1e_get_gfn(*gl1e);
+            mfn = shadow_l1e_get_mfn(*sl1e);
+            gmfn = audit_gfn_to_mfn(v, gfn, gl1mfn);
+            if ( mfn_x(gmfn) != mfn_x(mfn) )
+                AUDIT_FAIL(1, "bad translation: gfn %" SH2_PRI_gfn
+                           " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
+                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+        }
+    });
+    sh2_unmap_domain_page(gp);
+    return done;
+}
+
+int sh2_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x)
+/* Audit a fake (splintered-superpage) l1 shadow.  With no guest l1 to
+ * compare against, only the shadow entries' flags can be sanity-checked. */
+{
+    guest_l1e_t *gl1e, e;
+    shadow_l1e_t *sl1e;
+    mfn_t gl1mfn = _mfn(INVALID_MFN);
+    int f;
+    int done = 0;
+
+    /* fl1 has no useful backpointer: all we can check are flags */
+    e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */
+    SHADOW2_FOREACH_L1E(sl1mfn, sl1e, 0, done, {
+        f = shadow_l1e_get_flags(*sl1e);
+        /* Ignore the software-available bits before comparing. */
+        f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2);
+        /* Splintered entries are either empty, fully-accessible, or
+         * accessible-but-read-only. */
+        if ( !(f == 0 
+               || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+                        _PAGE_ACCESSED|_PAGE_DIRTY) 
+               || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY)) )
+            AUDIT_FAIL(1, "fl1e has bad flags");
+    });
+    return 0;
+}
+
+int sh2_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x)
+/* Audit an l2 shadow against its guest l2.  Entries should point at the
+ * appropriate l1 shadow (or fl1 shadow for guest superpage entries). */
+{
+    guest_l2e_t *gl2e, *gp;
+    shadow_l2e_t *sl2e;
+    mfn_t mfn, gmfn, gl2mfn;
+    gfn_t gfn;
+    char *s;
+    int done = 0;
+#if GUEST_PAGING_LEVELS != 4
+    /* PV l2 shadows below 4 levels carry Xen mappings; skip them. */
+    int xen_mappings = !shadow2_mode_external(v->domain);
+#endif
+
+    /* Follow the backpointer */
+    gl2mfn = _mfn(mfn_to_page(sl2mfn)->u.inuse.type_info);
+    gl2e = gp = sh2_map_domain_page(gl2mfn);
+    SHADOW2_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, xen_mappings, {
+
+        s = sh2_audit_flags(v, 2, guest_l2e_get_flags(*gl2e),
+                            shadow_l2e_get_flags(*sl2e));
+        if ( s ) AUDIT_FAIL(2, "%s", s);
+
+        if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
+        {
+            gfn = guest_l2e_get_gfn(*gl2e);
+            mfn = shadow_l2e_get_mfn(*sl2e);
+            /* A guest superpage entry is shadowed by an fl1; a normal
+             * entry by the l1 shadow of the guest l1 it points at. */
+            gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE)  
+                ? get_fl1_shadow_status(v, gfn)
+                : get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl2mfn), 
+                                    PGC_SH2_l1_shadow);
+            if ( mfn_x(gmfn) != mfn_x(mfn) )
+                AUDIT_FAIL(2, "bad translation: gfn %" SH2_PRI_gfn
+                           " (--> %" SH2_PRI_mfn ")"
+                           " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
+                           gfn_x(gfn), 
+                           (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0
+                           : mfn_x(audit_gfn_to_mfn(v, gfn, gl2mfn)),
+                           mfn_x(gmfn), mfn_x(mfn));
+        }
+    });
+    sh2_unmap_domain_page(gp);
+    return 0;
+}
+
+#if GUEST_PAGING_LEVELS >= 3
+int sh2_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x)
+/* Audit an l3 shadow against its guest l3.  Entries should point at the
+ * appropriate l2 shadow (l2h for the top slot of a PV PAE guest). */
+{
+    guest_l3e_t *gl3e, *gp;
+    shadow_l3e_t *sl3e;
+    mfn_t mfn, gmfn, gl3mfn;
+    gfn_t gfn;
+    char *s;
+    int done = 0;
+
+    /* Follow the backpointer */
+    gl3mfn = _mfn(mfn_to_page(sl3mfn)->u.inuse.type_info);
+    gl3e = gp = sh2_map_domain_page(gl3mfn);
+    SHADOW2_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
+
+        s = sh2_audit_flags(v, 3, guest_l3e_get_flags(*gl3e),
+                            shadow_l3e_get_flags(*sl3e));
+        if ( s ) AUDIT_FAIL(3, "%s", s);
+
+        if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
+        {
+            gfn = guest_l3e_get_gfn(*gl3e);
+            mfn = shadow_l3e_get_mfn(*sl3e);
+            /* The 4th slot of a PV PAE guest's l3 holds the special
+             * l2h shadow (it maps Xen and linear entries). */
+            gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl3mfn), 
+                                     (GUEST_PAGING_LEVELS == 3 
+                                      && !shadow2_mode_external(v->domain)
+                                      && (guest_index(gl3e) % 4) == 3)
+                                     ? PGC_SH2_l2h_pae_shadow
+                                     : PGC_SH2_l2_shadow);
+            if ( mfn_x(gmfn) != mfn_x(mfn) )
+                AUDIT_FAIL(3, "bad translation: gfn %" SH2_PRI_gfn
+                           " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
+                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+        }
+    });
+    sh2_unmap_domain_page(gp);
+    return 0;
+}
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+#if GUEST_PAGING_LEVELS >= 4
+int sh2_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x)
+/* Audit an l4 shadow against its guest l4.  Entries should point at the
+ * l3 shadows of the guest l3s they reference. */
+{
+    guest_l4e_t *gl4e, *gp;
+    shadow_l4e_t *sl4e;
+    mfn_t mfn, gmfn, gl4mfn;
+    gfn_t gfn;
+    char *s;
+    int done = 0;
+    /* Non-external (PV) l4s carry Xen mappings; skip them in the walk. */
+    int xen_mappings = !shadow2_mode_external(v->domain);
+
+    /* Follow the backpointer */
+    gl4mfn = _mfn(mfn_to_page(sl4mfn)->u.inuse.type_info);
+    gl4e = gp = sh2_map_domain_page(gl4mfn);
+    SHADOW2_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, xen_mappings,
+    {
+        s = sh2_audit_flags(v, 4, guest_l4e_get_flags(*gl4e),
+                            shadow_l4e_get_flags(*sl4e));
+        if ( s ) AUDIT_FAIL(4, "%s", s);
+
+        if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_MFNS )
+        {
+            gfn = guest_l4e_get_gfn(*gl4e);
+            mfn = shadow_l4e_get_mfn(*sl4e);
+            gmfn = get_shadow_status(v, audit_gfn_to_mfn(v, gfn, gl4mfn), 
+                                     PGC_SH2_l3_shadow);
+            if ( mfn_x(gmfn) != mfn_x(mfn) )
+                AUDIT_FAIL(4, "bad translation: gfn %" SH2_PRI_gfn
+                           " --> %" SH2_PRI_mfn " != mfn %" SH2_PRI_mfn "\n",
+                           gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn));
+        }
+    });
+    sh2_unmap_domain_page(gp);
+    return 0;
+}
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+
+#undef AUDIT_FAIL
+
+#endif /* Audit code */
+
+/**************************************************************************/
+/* Entry points into this mode of the shadow code.
+ * This will all be mangled by the preprocessor to uniquify everything. */
+/* Per-mode virtual function table: one instance of this structure is
+ * built for each (guest levels, shadow levels) combination this file is
+ * compiled for; the names below are uniquified by the preprocessor. */
+struct shadow2_entry_points shadow2_entry = {
+    .page_fault             = sh2_page_fault, 
+    .invlpg                 = sh2_invlpg,
+    .gva_to_gpa             = sh2_gva_to_gpa,
+    .gva_to_gfn             = sh2_gva_to_gfn,
+    .update_cr3             = sh2_update_cr3,
+    .map_and_validate_gl1e  = sh2_map_and_validate_gl1e,
+    .map_and_validate_gl2e  = sh2_map_and_validate_gl2e,
+    .map_and_validate_gl2he = sh2_map_and_validate_gl2he,
+    .map_and_validate_gl3e  = sh2_map_and_validate_gl3e,
+    .map_and_validate_gl4e  = sh2_map_and_validate_gl4e,
+    .detach_old_tables      = sh2_detach_old_tables,
+    .x86_emulate_write      = sh2_x86_emulate_write,
+    .x86_emulate_cmpxchg    = sh2_x86_emulate_cmpxchg,
+    .x86_emulate_cmpxchg8b  = sh2_x86_emulate_cmpxchg8b,
+    .make_monitor_table     = sh2_make_monitor_table,
+    .destroy_monitor_table  = sh2_destroy_monitor_table,
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
+    .guess_wrmap            = sh2_guess_wrmap,
+#endif
+    .guest_levels           = GUEST_PAGING_LEVELS,
+    .shadow_levels          = SHADOW_PAGING_LEVELS,
+};
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End: 
+ */
diff --git a/xen/arch/x86/shadow32.c b/xen/arch/x86/shadow32.c
deleted file mode 100644 (file)
index 3926697..0000000
+++ /dev/null
@@ -1,3782 +0,0 @@
-/******************************************************************************
- * arch/x86/shadow.c
- * 
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
- * 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/mm.h>
-#include <xen/domain_page.h>
-#include <asm/shadow.h>
-#include <asm/page.h>
-#include <xen/event.h>
-#include <xen/sched.h>
-#include <xen/trace.h>
-#include <xen/guest_access.h>
-
-#define MFN_PINNED(_x) (mfn_to_page(_x)->u.inuse.type_info & PGT_pinned)
-#define va_to_l1mfn(_ed, _va) \
-    (l2e_get_pfn(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]))
-
-static void shadow_free_snapshot(struct domain *d,
-                                 struct out_of_sync_entry *entry);
-static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn);
-static void free_writable_pte_predictions(struct domain *d);
-
-#if SHADOW_DEBUG
-static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
-#endif
-
-static int alloc_p2m_table(struct domain *d);
-static void free_p2m_table(struct domain *d);
-
-/********
-
-There's a per-domain shadow table spin lock which works fine for SMP
-hosts. We don't have to worry about interrupts as no shadow operations
-happen in an interrupt context. It's probably not quite ready for SMP
-guest operation as we have to worry about synchonisation between gpte
-and spte updates. Its possible that this might only happen in a
-hypercall context, in which case we'll probably at have a per-domain
-hypercall lock anyhow (at least initially).
-
-********/
-
-static inline int
-shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn,
-               unsigned long new_type)
-{
-    struct page_info *page = mfn_to_page(gmfn);
-    int pinned = 0, okay = 1;
-
-    if ( page_out_of_sync(page) )
-    {
-        // Don't know how long ago this snapshot was taken.
-        // Can't trust it to be recent enough.
-        //
-        __shadow_sync_mfn(d, gmfn);
-    }
-
-    if ( !shadow_mode_refcounts(d) )
-        return 1;
-
-    if ( unlikely(page_is_page_table(page)) )
-        return 1;
-
-    FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type);
-
-    if ( !shadow_remove_all_write_access(d, gpfn, gmfn) )
-    {
-        FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx",
-                __func__, gpfn, gmfn);
-#if 1 || defined(LIVE_DANGEROUSLY)
-        set_bit(_PGC_page_table, &page->count_info);
-        return 1;
-#endif
-        return 0;
-        
-    }
-
-    // To convert this page to use as a page table, the writable count
-    // should now be zero.  Test this by grabbing the page as an page table,
-    // and then immediately releasing.  This will also deal with any
-    // necessary TLB flushing issues for us.
-    //
-    // The cruft here about pinning doesn't really work right.  This
-    // needs rethinking/rewriting...  Need to gracefully deal with the
-    // TLB flushes required when promoting a writable page, and also deal
-    // with any outstanding (external) writable refs to this page (by
-    // refusing to promote it).  The pinning headache complicates this
-    // code -- it would all get much simpler if we stop using
-    // shadow_lock() and move the shadow code to BIGLOCK().
-    //
-    if ( unlikely(!get_page(page, d)) )
-        BUG(); // XXX -- needs more thought for a graceful failure
-    if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) )
-    {
-        pinned = 1;
-        put_page_and_type(page);
-    }
-    if ( get_page_type(page, PGT_base_page_table) )
-    {
-        set_bit(_PGC_page_table, &page->count_info);
-        put_page_type(page);
-    }
-    else
-    {
-        printk("shadow_promote: get_page_type failed "
-               "dom%d gpfn=%lx gmfn=%lx t=%08lx\n",
-               d->domain_id, gpfn, gmfn, new_type);
-        okay = 0;
-    }
-
-    // Now put the type back to writable...
-    if ( unlikely(!get_page_type(page, PGT_writable_page)) )
-        BUG(); // XXX -- needs more thought for a graceful failure
-    if ( unlikely(pinned) )
-    {
-        if ( unlikely(test_and_set_bit(_PGT_pinned,
-                                       &page->u.inuse.type_info)) )
-            BUG(); // hmm... someone pinned this again?
-    }
-    else
-        put_page_and_type(page);
-
-    return okay;
-}
-
-static inline void
-shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return;
-
-    ASSERT(mfn_to_page(gmfn)->count_info & PGC_page_table);
-
-    if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
-    {
-        clear_bit(_PGC_page_table, &mfn_to_page(gmfn)->count_info);
-
-        if ( page_out_of_sync(mfn_to_page(gmfn)) )
-        {
-            remove_out_of_sync_entries(d, gmfn);
-        }
-    }
-}
-
-/*
- * Things in shadow mode that collect get_page() refs to the domain's
- * pages are:
- * - PGC_allocated takes a gen count, just like normal.
- * - A writable page can be pinned (paravirtualized guests may consider
- *   these pages to be L1s or L2s, and don't know the difference).
- *   Pinning a page takes a gen count (but, for domains in shadow mode,
- *   it *doesn't* take a type count)
- * - CR3 grabs a ref to whatever it points at, just like normal.
- * - Shadow mode grabs an initial gen count for itself, as a placehold
- *   for whatever references will exist.
- * - Shadow PTEs that point to a page take a gen count, just like regular
- *   PTEs.  However, they don't get a type count, as get_page_type() is
- *   hardwired to keep writable pages' counts at 1 for domains in shadow
- *   mode.
- * - Whenever we shadow a page, the entry in the shadow hash grabs a
- *   general ref to the page.
- * - Whenever a page goes out of sync, the out of sync entry grabs a
- *   general ref to the page.
- */
-/*
- * page_info fields for pages allocated as shadow pages:
- *
- * All 32 bits of count_info are a simple count of refs to this shadow
- * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table),
- * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync
- * references.
- *
- * u.inuse._domain is left NULL, to prevent accidently allow some random
- * domain from gaining permissions to map this page.
- *
- * u.inuse.type_info & PGT_type_mask remembers what kind of page is being
- * shadowed.
- * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed.
- * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow
- * is currently exists because this is a shadow of a root page, and we
- * don't want to let those disappear just because no CR3 is currently pointing
- * at it.
- *
- * tlbflush_timestamp holds a min & max index of valid page table entries
- * within the shadow page.
- */
-
-static inline unsigned long
-alloc_shadow_page(struct domain *d,
-                  unsigned long gpfn, unsigned long gmfn,
-                  u32 psh_type)
-{
-    struct page_info *page;
-    unsigned long smfn;
-    int pin = 0;
-    void *l1;
-
-    // Currently, we only keep pre-zero'ed pages around for use as L1's...
-    // This will change.  Soon.
-    //
-    if ( psh_type == PGT_l1_shadow )
-    {
-        if ( !list_empty(&d->arch.free_shadow_frames) )
-        {
-            struct list_head *entry = d->arch.free_shadow_frames.next;
-            page = list_entry(entry, struct page_info, list);
-            list_del(entry);
-            perfc_decr(free_l1_pages);
-        }
-        else
-        {
-            page = alloc_domheap_page(NULL);
-            l1 = map_domain_page(page_to_mfn(page));
-            memset(l1, 0, PAGE_SIZE);
-            unmap_domain_page(l1);
-        }
-    }
-    else
-        page = alloc_domheap_page(NULL);
-
-    if ( unlikely(page == NULL) )
-    {
-        printk("Couldn't alloc shadow page! dom%d count=%d\n",
-               d->domain_id, d->arch.shadow_page_count);
-        printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n",
-               perfc_value(shadow_l1_pages), 
-               perfc_value(shadow_l2_pages),
-               perfc_value(hl2_table_pages),
-               perfc_value(snapshot_pages));
-        /* XXX FIXME: try a shadow flush to free up some memory. */
-        domain_crash_synchronous();
-    }
-
-    smfn = page_to_mfn(page);
-
-    ASSERT( (gmfn & ~PGT_mfn_mask) == 0 );
-    page->u.inuse.type_info = psh_type | gmfn;
-    page->count_info = 0;
-    page->tlbflush_timestamp = 0;
-
-    switch ( psh_type )
-    {
-    case PGT_l1_shadow:
-        if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
-            goto fail;
-        perfc_incr(shadow_l1_pages);
-        d->arch.shadow_page_count++;
-        break;
-
-    case PGT_l2_shadow:
-        if ( !shadow_promote(d, gpfn, gmfn, psh_type) )
-            goto fail;
-        perfc_incr(shadow_l2_pages);
-        d->arch.shadow_page_count++;
-        if ( PGT_l2_page_table == PGT_root_page_table )
-            pin = 1;
-
-        break;
-
-    case PGT_hl2_shadow:
-        // Treat an hl2 as an L1 for purposes of promotion.
-        // For external mode domains, treat them as an L2 for purposes of
-        // pinning.
-        //
-        if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) )
-            goto fail;
-        perfc_incr(hl2_table_pages);
-        d->arch.hl2_page_count++;
-        if ( shadow_mode_external(d) &&
-             (PGT_l2_page_table == PGT_root_page_table) )
-            pin = 1;
-
-        break;
-
-    case PGT_snapshot:
-        perfc_incr(snapshot_pages);
-        d->arch.snapshot_page_count++;
-        break;
-
-    default:
-        printk("Alloc shadow weird page type type=%08x\n", psh_type);
-        BUG();
-        break;
-    }
-
-    // Don't add a new shadow of something that already has a snapshot.
-    //
-    ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) );
-
-    set_shadow_status(d, gpfn, gmfn, smfn, psh_type, 0);
-
-    if ( pin )
-        shadow_pin(smfn);
-
-    return smfn;
-
-  fail:
-    FSH_LOG("promotion of pfn=%lx mfn=%lx failed!  external gnttab refs?",
-            gpfn, gmfn);
-    free_domheap_page(page);
-    return 0;
-}
-
-static void inline
-free_shadow_l1_table(struct domain *d, unsigned long smfn)
-{
-    l1_pgentry_t *pl1e = map_domain_page(smfn);
-    int i;
-    struct page_info *spage = mfn_to_page(smfn);
-    u32 min_max = spage->tlbflush_timestamp;
-    int min = SHADOW_MIN(min_max);
-    int max = SHADOW_MAX(min_max);
-
-    for ( i = min; i <= max; i++ )
-    {
-        shadow_put_page_from_l1e(pl1e[i], d);
-        pl1e[i] = l1e_empty();
-    }
-
-    unmap_domain_page(pl1e);
-}
-
-static void inline
-free_shadow_hl2_table(struct domain *d, unsigned long smfn)
-{
-    l1_pgentry_t *hl2 = map_domain_page(smfn);
-    int i, limit;
-
-    SH_VVLOG("%s: smfn=%lx freed", __func__, smfn);
-
-    if ( shadow_mode_external(d) )
-        limit = L2_PAGETABLE_ENTRIES;
-    else
-        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-
-    for ( i = 0; i < limit; i++ )
-    {
-        if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
-            put_page(mfn_to_page(l1e_get_pfn(hl2[i])));
-    }
-
-    unmap_domain_page(hl2);
-}
-
-static void inline
-free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type)
-{
-    l2_pgentry_t *pl2e = map_domain_page(smfn);
-    int i, external = shadow_mode_external(d);
-
-    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-        if ( external || is_guest_l2_slot(type, i) )
-            if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
-                put_shadow_ref(l2e_get_pfn(pl2e[i]));
-
-    if ( (PGT_base_page_table == PGT_l2_page_table) &&
-         shadow_mode_translate(d) && !external )
-    {
-        // free the ref to the hl2
-        //
-        put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]));
-    }
-
-    unmap_domain_page(pl2e);
-}
-
-void free_shadow_page(unsigned long smfn)
-{
-    struct page_info *page = mfn_to_page(smfn);
-    unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
-    struct domain *d = page_get_owner(mfn_to_page(gmfn));
-    unsigned long gpfn = mfn_to_gmfn(d, gmfn);
-    unsigned long type = page->u.inuse.type_info & PGT_type_mask;
-
-    SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn);
-
-    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
-
-    delete_shadow_status(d, gpfn, gmfn, type, 0);
-
-    switch ( type )
-    {
-    case PGT_l1_shadow:
-        perfc_decr(shadow_l1_pages);
-        shadow_demote(d, gpfn, gmfn);
-        free_shadow_l1_table(d, smfn);
-        d->arch.shadow_page_count--;
-        break;
-
-    case PGT_l2_shadow:
-        perfc_decr(shadow_l2_pages);
-        shadow_demote(d, gpfn, gmfn);
-        free_shadow_l2_table(d, smfn, page->u.inuse.type_info);
-        d->arch.shadow_page_count--;
-        break;
-
-    case PGT_hl2_shadow:
-        perfc_decr(hl2_table_pages);
-        shadow_demote(d, gpfn, gmfn);
-        free_shadow_hl2_table(d, smfn);
-        d->arch.hl2_page_count--;
-        break;
-
-    case PGT_snapshot:
-        perfc_decr(snapshot_pages);
-        d->arch.snapshot_page_count--;
-        break;
-
-    default:
-        printk("Free shadow weird page type mfn=%lx type=%" PRtype_info "\n",
-               page_to_mfn(page), page->u.inuse.type_info);
-        break;
-    }
-
-    // No TLB flushes are needed the next time this page gets allocated.
-    //
-    page->tlbflush_timestamp = 0;
-    page->u.free.cpumask     = CPU_MASK_NONE;
-
-    if ( type == PGT_l1_shadow )
-    {
-        list_add(&page->list, &d->arch.free_shadow_frames);
-        perfc_incr(free_l1_pages);
-    }
-    else
-        free_domheap_page(page);
-}
-
-void
-remove_shadow(struct domain *d, unsigned long gpfn, u32 stype)
-{
-    unsigned long smfn;
-
-    //printk("%s(gpfn=%lx, type=%x)\n", __func__, gpfn, stype);
-
-    shadow_lock(d);
-
-    while ( stype >= PGT_l1_shadow )
-    {
-        smfn = __shadow_status(d, gpfn, stype);
-        if ( smfn && MFN_PINNED(smfn) )
-            shadow_unpin(smfn);
-        stype -= PGT_l1_shadow;
-    }
-
-    shadow_unlock(d);
-}
-
-static void inline
-release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
-{
-    struct page_info *page;
-
-    page = mfn_to_page(entry->gmfn);
-
-    // Decrement ref count of guest & shadow pages
-    //
-    put_page(page);
-
-    // Only use entries that have low bits clear...
-    //
-    if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
-    {
-        put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
-        entry->writable_pl1e = -2;
-    }
-    else
-        ASSERT( entry->writable_pl1e == -1 );
-
-    // Free the snapshot
-    //
-    shadow_free_snapshot(d, entry);
-}
-
-static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
-{
-    struct out_of_sync_entry *entry = d->arch.out_of_sync;
-    struct out_of_sync_entry **prev = &d->arch.out_of_sync;
-    struct out_of_sync_entry *found = NULL;
-
-    // NB: Be careful not to call something that manipulates this list
-    //     while walking it.  Collect the results into a separate list
-    //     first, then walk that list.
-    //
-    while ( entry )
-    {
-        if ( entry->gmfn == gmfn )
-        {
-            // remove from out of sync list
-            *prev = entry->next;
-
-            // add to found list
-            entry->next = found;
-            found = entry;
-
-            entry = *prev;
-            continue;
-        }
-        prev = &entry->next;
-        entry = entry->next;
-    }
-
-    prev = NULL;
-    entry = found;
-    while ( entry )
-    {
-        release_out_of_sync_entry(d, entry);
-
-        prev = &entry->next;
-        entry = entry->next;
-    }
-
-    // Add found list to free list
-    if ( prev )
-    {
-        *prev = d->arch.out_of_sync_free;
-        d->arch.out_of_sync_free = found;
-    }
-}
-
-static void free_out_of_sync_state(struct domain *d)
-{
-    struct out_of_sync_entry *entry;
-
-    // NB: Be careful not to call something that manipulates this list
-    //     while walking it.  Remove one item at a time, and always
-    //     restart from start of list.
-    //
-    while ( (entry = d->arch.out_of_sync) )
-    {
-        d->arch.out_of_sync = entry->next;
-        release_out_of_sync_entry(d, entry);
-
-        entry->next = d->arch.out_of_sync_free;
-        d->arch.out_of_sync_free = entry;
-    }
-}
-
-static void free_shadow_pages(struct domain *d)
-{
-    int                   i;
-    struct shadow_status *x;
-    struct vcpu          *v;
-    struct list_head *list_ent, *tmp;
-    /*
-     * WARNING! The shadow page table must not currently be in use!
-     * e.g., You are expected to have paused the domain and synchronized CR3.
-     */
-
-    if( !d->arch.shadow_ht ) return;
-
-    shadow_audit(d, 1);
-
-    // first, remove any outstanding refs from out_of_sync entries...
-    //
-    free_out_of_sync_state(d);
-
-    // second, remove any outstanding refs from v->arch.shadow_table
-    // and CR3.
-    //
-    for_each_vcpu(d, v)
-    {
-        if ( pagetable_get_paddr(v->arch.shadow_table) )
-        {
-            put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table));
-            v->arch.shadow_table = pagetable_null();
-
-            if ( shadow_mode_external(d) )
-            {
-                if ( v->arch.shadow_vtable )
-                    unmap_domain_page_global(v->arch.shadow_vtable);
-                v->arch.shadow_vtable = NULL;
-            }
-        }
-
-        if ( v->arch.monitor_shadow_ref )
-        {
-            put_shadow_ref(v->arch.monitor_shadow_ref);
-            v->arch.monitor_shadow_ref = 0;
-        }
-    }
-
-    // For external shadows, remove the monitor table's refs
-    //
-    if ( shadow_mode_external(d) )
-    {
-        for_each_vcpu(d, v)
-        {
-            l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
-
-            if ( mpl2e )
-            {
-                l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
-                l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
-
-                if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
-                {
-                    put_shadow_ref(l2e_get_pfn(hl2e));
-                    mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
-                }
-                if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
-                {
-                    put_shadow_ref(l2e_get_pfn(smfn));
-                    mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
-                }
-            }
-        }
-    }
-
-    // Now, the only refs to shadow pages that are left are from the shadow
-    // pages themselves.  We just unpin the pinned pages, and the rest
-    // should automatically disappear.
-    //
-    // NB: Beware: each explicitly or implicit call to free_shadow_page
-    // can/will result in the hash bucket getting rewritten out from
-    // under us...  First, collect the list of pinned pages, then
-    // free them.
-    //
-    // FIXME: it would be good to just free all the pages referred to in
-    // the hash table without going through each of them to decrement their
-    // reference counts.  In shadow_mode_refcount(), we've gotta do the hard
-    // work, but only for L1 shadows.  If we're not in refcount mode, then
-    // there's no real hard work to do at all.  Need to be careful with the
-    // writable_pte_predictions and snapshot entries in the hash table, but
-    // that's about it.
-    //
-    for ( i = 0; i < shadow_ht_buckets; i++ )
-    {
-        u32 count;
-        unsigned long *mfn_list;
-
-        /* Skip empty buckets. */
-        if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
-            continue;
-
-        count = 0;
-
-        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) {
-           /* Skip entries that are writable_pred) */
-           switch(x->gpfn_and_flags & PGT_type_mask){
-               case PGT_l1_shadow:
-               case PGT_l2_shadow:
-               case PGT_l3_shadow:
-               case PGT_l4_shadow:
-               case PGT_hl2_shadow:
-                   if ( MFN_PINNED(x->smfn) )
-                       count++;
-                   break;
-               case PGT_snapshot:
-               case PGT_writable_pred:
-                   break;
-               default:
-                   BUG();
-
-           }
-       }
-
-        if ( !count )
-            continue;
-
-        mfn_list = xmalloc_array(unsigned long, count);
-        count = 0;
-        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) {
-           /* Skip entries that are writable_pred) */
-           switch(x->gpfn_and_flags & PGT_type_mask){
-               case PGT_l1_shadow:
-               case PGT_l2_shadow:
-               case PGT_l3_shadow:
-               case PGT_l4_shadow:
-               case PGT_hl2_shadow:
-                   if ( MFN_PINNED(x->smfn) )
-                       mfn_list[count++] = x->smfn;
-                   break;
-               case PGT_snapshot:
-               case PGT_writable_pred:
-                   break;
-               default:
-                   BUG();
-
-           }
-       }
-
-        while ( count )
-        {
-            shadow_unpin(mfn_list[--count]);
-        }
-        xfree(mfn_list);
-    }
-
-    /* Now free the pre-zero'ed pages from the domain */
-    list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames)
-    {
-        struct page_info *page = list_entry(list_ent, struct page_info, list);
-
-        list_del(list_ent);
-        perfc_decr(free_l1_pages);
-
-        free_domheap_page(page);
-    }
-
-    shadow_audit(d, 0);
-
-    SH_VLOG("Free shadow table.");
-}
-
-void shadow_mode_init(void)
-{
-}
-
-int _shadow_mode_refcounts(struct domain *d)
-{
-    return shadow_mode_refcounts(d);
-}
-
-static void alloc_monitor_pagetable(struct vcpu *v)
-{
-    unsigned long mmfn;
-    l2_pgentry_t *mpl2e;
-    struct page_info *mmfn_info;
-    struct domain *d = v->domain;
-    int i;
-
-    ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0);
-
-    mmfn_info = alloc_domheap_page(NULL);
-    ASSERT(mmfn_info != NULL);
-
-    mmfn = page_to_mfn(mmfn_info);
-    mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn);
-    memset(mpl2e, 0, PAGE_SIZE);
-
-    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-
-    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
-        mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
-            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
-                          __PAGE_HYPERVISOR);
-
-    // Don't (yet) have mappings for these...
-    // Don't want to accidentally see the idle_pg_table's linear mapping.
-    //
-    mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
-    mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
-    mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = l2e_empty();
-
-    v->arch.monitor_table = pagetable_from_pfn(mmfn);
-    v->arch.monitor_vtable = mpl2e;
-
-    if ( v->vcpu_id == 0 )
-        alloc_p2m_table(d);
-    else
-    {
-        unsigned long mfn;
-
-        mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
-        if ( mfn )
-        {
-            l2_pgentry_t *l2tab;
-
-            l2tab = map_domain_page(mfn);
-
-            mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
-                l2tab[l2_table_offset(RO_MPT_VIRT_START)];
-
-            unmap_domain_page(l2tab);
-        }
-    }
-}
-
-/*
- * Free the pages for monitor_table and hl2_table
- */
-void free_monitor_pagetable(struct vcpu *v)
-{
-    l2_pgentry_t *mpl2e, hl2e, sl2e;
-    unsigned long mfn;
-
-    ASSERT( pagetable_get_paddr(v->arch.monitor_table) );
-
-    mpl2e = v->arch.monitor_vtable;
-
-    /*
-     * First get the mfn for hl2_table by looking at monitor_table
-     */
-    hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
-    if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
-    {
-        mfn = l2e_get_pfn(hl2e);
-        ASSERT(mfn);
-        put_shadow_ref(mfn);
-    }
-
-    sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
-    if ( l2e_get_flags(sl2e) & _PAGE_PRESENT )
-    {
-        mfn = l2e_get_pfn(sl2e);
-        ASSERT(mfn);
-        put_shadow_ref(mfn);
-    }
-
-    if ( v->vcpu_id == 0 )
-        free_p2m_table(v->domain);
-
-    /*
-     * Then free monitor_table.
-     */
-    mfn = pagetable_get_pfn(v->arch.monitor_table);
-    unmap_domain_page_global(v->arch.monitor_vtable);
-    free_domheap_page(mfn_to_page(mfn));
-
-    v->arch.monitor_table = pagetable_null();
-    v->arch.monitor_vtable = 0;
-}
-
-static int
-map_p2m_entry(l1_pgentry_t *l1tab, unsigned long gpfn, unsigned long mfn)
-{
-    unsigned long *l0tab = NULL;
-    l1_pgentry_t l1e = { 0 };
-    struct page_info *page;
-    unsigned long va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn));
-
-    l1e = l1tab[l1_table_offset(va)];
-    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
-    {
-        page = alloc_domheap_page(NULL);
-        if ( !page )
-            return 0;
-
-        l0tab = map_domain_page(page_to_mfn(page));
-        memset(l0tab, 0, PAGE_SIZE);
-
-        l1e = l1tab[l1_table_offset(va)] =
-            l1e_from_page(page, __PAGE_HYPERVISOR);
-    }
-    else
-        l0tab = map_domain_page(l1e_get_pfn(l1e));
-
-    l0tab[gpfn & ((PAGE_SIZE / sizeof(mfn)) - 1)] = mfn;
-
-    unmap_domain_page(l0tab);
-
-    return 1;
-}
-
-int
-set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn,
-              struct domain_mmap_cache *l2cache,
-              struct domain_mmap_cache *l1cache)
-{
-    unsigned long tabpfn;
-    l2_pgentry_t *l2, l2e;
-    l1_pgentry_t *l1;
-    struct page_info *l1page;
-    unsigned long va = pfn << PAGE_SHIFT;
-
-    if ( shadow_mode_external(d) )
-        tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
-    else
-        tabpfn = pagetable_get_pfn(d->arch.phys_table);
-
-    ASSERT(tabpfn != 0);
-    ASSERT(shadow_lock_is_acquired(d));
-
-    l2 = map_domain_page_with_cache(tabpfn, l2cache);
-
-    /*
-     * The following code covers (SHM_translate | SHM_external) mode.
-     */
-
-    if ( shadow_mode_external(d) )
-    {
-        int error;
-        l1_pgentry_t *l1tab = NULL;
-        l2_pgentry_t l2e;
-
-        l2e = l2[l2_table_offset(RO_MPT_VIRT_START)];
-
-        ASSERT( l2e_get_flags(l2e) & _PAGE_PRESENT );
-
-        l1tab = map_domain_page(l2e_get_pfn(l2e));
-        if ( !(error = map_p2m_entry(l1tab, pfn, mfn)) )
-            domain_crash(d);
-
-        unmap_domain_page(l1tab);
-        unmap_domain_page_with_cache(l2, l2cache);
-
-        return error;
-    }
-
-    /*
-     * The following code covers SHM_translate mode.
-     */
-    ASSERT(shadow_mode_translate(d));
-
-    l2e = l2[l2_table_offset(va)];
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-    {
-        l1page = alloc_domheap_page(NULL);
-        if ( !l1page )
-        {
-            unmap_domain_page_with_cache(l2, l2cache);
-            return 0;
-        }
-
-        l1 = map_domain_page_with_cache(page_to_mfn(l1page), l1cache);
-        /* Initialise entries to INVALID_MFN = ~0 */
-        memset(l1, -1, PAGE_SIZE);
-        unmap_domain_page_with_cache(l1, l1cache);
-
-        l2e = l2e_from_page(l1page, __PAGE_HYPERVISOR);
-        l2[l2_table_offset(va)] = l2e;
-    }
-    unmap_domain_page_with_cache(l2, l2cache);
-
-    l1 = map_domain_page_with_cache(l2e_get_pfn(l2e), l1cache);
-    l1[l1_table_offset(va)] = (l1_pgentry_t) { mfn };
-    unmap_domain_page_with_cache(l1, l1cache);
-
-    return 1;
-}
-
-static int
-alloc_p2m_table(struct domain *d)
-{
-    struct list_head *list_ent;
-
-    l2_pgentry_t *l2tab = NULL;
-    l1_pgentry_t *l1tab = NULL;
-    l2_pgentry_t l2e = { 0 };
-    struct page_info *page;
-    unsigned long gpfn, mfn;
-    int error = 0;
-
-    if ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) )
-    {
-        l2tab = map_domain_page(
-            pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
-        l2e = l2tab[l2_table_offset(RO_MPT_VIRT_START)];
-        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        {
-            page = alloc_domheap_page(NULL);
-
-            l1tab = map_domain_page(page_to_mfn(page));
-            memset(l1tab, 0, PAGE_SIZE);
-            l2e = l2tab[l2_table_offset(RO_MPT_VIRT_START)] =
-                l2e_from_page(page, __PAGE_HYPERVISOR);
-        }
-        else
-            l1tab = map_domain_page(l2e_get_pfn(l2e));
-
-        if ( l2tab )
-            unmap_domain_page(l2tab);
-    }
-    else
-    {
-        page = alloc_domheap_page(NULL);
-        if (!page)
-        {
-            printk("Alloc p2m table fail\n");
-            domain_crash(d);
-        }
-
-        l1tab = map_domain_page(page_to_mfn(page));
-        memset(l1tab, 0, PAGE_SIZE);
-        d->arch.phys_table = pagetable_from_page(page);
-    }
-
-    list_ent = d->page_list.next;
-
-    while ( list_ent != &d->page_list )
-    {
-        page = list_entry(list_ent, struct page_info, list);
-        mfn = page_to_mfn(page);
-
-        gpfn = get_gpfn_from_mfn(mfn);
-
-        if ( !(error = map_p2m_entry(l1tab, gpfn, mfn)) )
-        {
-            domain_crash(d);
-            break;
-        }
-
-        list_ent = page->list.next;
-    }
-
-    unmap_domain_page(l1tab);
-
-    return error;
-}
-
-static void
-free_p2m_table(struct domain *d)
-{
-    unsigned long va;
-    l2_pgentry_t *l2tab;
-    l1_pgentry_t *l1tab;
-    l2_pgentry_t l2e;
-    l1_pgentry_t l1e;
-
-    ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
-
-    l2tab = map_domain_page(
-        pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
-
-    for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; )
-    {
-        int i;
-
-        l2e = l2tab[l2_table_offset(va)];
-        if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
-        {
-            l1tab = map_domain_page(l2e_get_pfn(l2e));
-            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++)
-            {
-                l1e = l1tab[l1_table_offset(va)];
-
-                if ( l1e_get_flags(l1e) & _PAGE_PRESENT )
-                    free_domheap_page(mfn_to_page(l1e_get_pfn(l1e)));
-                va += PAGE_SIZE;
-            }
-            unmap_domain_page(l1tab);
-            free_domheap_page(mfn_to_page(l2e_get_pfn(l2e)));
-        }
-        else
-            va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
-    }
-    unmap_domain_page(l2tab);
-}
-
-int shadow_direct_map_fault(unsigned long vpa, struct cpu_user_regs *regs)
-{
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    l2_pgentry_t sl2e;
-    l1_pgentry_t sl1e;
-    l1_pgentry_t *sple = NULL;
-    unsigned long mfn, smfn;
-    struct page_info *page;
-
-    /*
-     * If the faulting address is within the MMIO range, we continue
-     * on handling the #PF as such.
-     */
-    if ( (mfn = get_mfn_from_gpfn(vpa >> PAGE_SHIFT)) == INVALID_MFN )
-        return 0;
-
-    shadow_lock(d);
-
-   __direct_get_l2e(v, vpa, &sl2e);
-
-    if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
-    {
-        page = alloc_domheap_page(NULL);
-        if ( !page )
-            goto nomem;
-
-        smfn = page_to_mfn(page);
-        sl2e = l2e_from_pfn(smfn, __PAGE_HYPERVISOR | _PAGE_USER);
-
-        sple = (l1_pgentry_t *)map_domain_page(smfn);
-        memset(sple, 0, PAGE_SIZE);
-        __direct_set_l2e(v, vpa, sl2e);
-    }
-
-    if ( !sple )
-        sple = (l1_pgentry_t *)map_domain_page(l2e_get_pfn(sl2e));
-
-    sl1e = sple[l1_table_offset(vpa)];
-
-    if ( !(l1e_get_flags(sl1e) & _PAGE_PRESENT) )
-    {
-        sl1e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR | _PAGE_USER);
-        sple[l1_table_offset(vpa)] = sl1e;
-    }
-
-    if (sple)
-        unmap_domain_page(sple);
-
-    shadow_unlock(d);
-    return EXCRET_fault_fixed;
-
-nomem:
-    shadow_direct_map_clean(d);
-    domain_crash_synchronous();
-}
-
-
-int shadow_direct_map_init(struct domain *d)
-{
-    struct page_info *page;
-    l2_pgentry_t *root;
-
-    if ( !(page = alloc_domheap_page(NULL)) )
-        return 0;
-
-    root = map_domain_page(page_to_mfn(page));
-    memset(root, 0, PAGE_SIZE);
-    unmap_domain_page(root);
-
-    d->arch.phys_table = pagetable_from_page(page);
-
-    return 1;
-}
-
-void shadow_direct_map_clean(struct domain *d)
-{
-    int i;
-    unsigned long mfn;
-    l2_pgentry_t *l2e;
-
-    mfn =  pagetable_get_pfn(d->arch.phys_table);
-
-    /*
-     * We may fail very early before direct map is built.
-     */
-    if ( !mfn )
-        return;
-
-    l2e = map_domain_page(mfn);
-
-    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-    {
-        if ( l2e_get_flags(l2e[i]) & _PAGE_PRESENT )
-            free_domheap_page(mfn_to_page(l2e_get_pfn(l2e[i])));
-    }
-    free_domheap_page(mfn_to_page(mfn));
-
-    unmap_domain_page(l2e);
-
-    d->arch.phys_table = pagetable_null();
-}
-
-int __shadow_mode_enable(struct domain *d, unsigned int mode)
-{
-    struct vcpu *v;
-    int new_modes = (mode & ~d->arch.shadow_mode);
-
-    if(!new_modes) /* Nothing to do - return success */
-        return 0;
-
-    // can't take anything away by calling this function.
-    ASSERT(!(d->arch.shadow_mode & ~mode));
-
-    for_each_vcpu(d, v)
-    {
-        invalidate_shadow_ldt(v);
-
-        // We need to set these up for __update_pagetables().
-        // See the comment there.
-
-        /*
-         * arch.guest_vtable
-         */
-        if ( v->arch.guest_vtable &&
-             (v->arch.guest_vtable != __linear_l2_table) )
-        {
-            unmap_domain_page_global(v->arch.guest_vtable);
-        }
-        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
-            v->arch.guest_vtable = __linear_l2_table;
-        else
-            v->arch.guest_vtable = NULL;
-
-        /*
-         * arch.shadow_vtable
-         */
-        if ( v->arch.shadow_vtable &&
-             (v->arch.shadow_vtable != __shadow_linear_l2_table) )
-        {
-            unmap_domain_page_global(v->arch.shadow_vtable);
-        }
-        if ( !(mode & SHM_external) )
-            v->arch.shadow_vtable = __shadow_linear_l2_table;
-        else
-            v->arch.shadow_vtable = NULL;
-
-        /*
-         * arch.hl2_vtable
-         */
-        if ( v->arch.hl2_vtable &&
-             (v->arch.hl2_vtable != __linear_hl2_table) )
-        {
-            unmap_domain_page_global(v->arch.hl2_vtable);
-        }
-        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
-            v->arch.hl2_vtable = __linear_hl2_table;
-        else
-            v->arch.hl2_vtable = NULL;
-
-        /*
-         * arch.monitor_table & arch.monitor_vtable
-         */
-        if ( v->arch.monitor_vtable )
-        {
-            free_monitor_pagetable(v);
-        }
-        if ( mode & SHM_external )
-        {
-            alloc_monitor_pagetable(v);
-        }
-    }
-
-    if ( new_modes & SHM_enable )
-    {
-        ASSERT( !d->arch.shadow_ht );
-        d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
-        if ( d->arch.shadow_ht == NULL )
-            goto nomem;
-
-        memset(d->arch.shadow_ht, 0,
-           shadow_ht_buckets * sizeof(struct shadow_status));
-    }
-
-    if ( new_modes & SHM_log_dirty )
-    {
-        ASSERT( !d->arch.shadow_dirty_bitmap );
-        d->arch.shadow_dirty_bitmap_size = 
-            (d->shared_info->arch.max_pfn +  63) & ~63;
-        d->arch.shadow_dirty_bitmap = 
-            xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
-                                         (8 * sizeof(unsigned long)));
-        if ( d->arch.shadow_dirty_bitmap == NULL )
-        {
-            d->arch.shadow_dirty_bitmap_size = 0;
-            goto nomem;
-        }
-        memset(d->arch.shadow_dirty_bitmap, 0, 
-               d->arch.shadow_dirty_bitmap_size/8);
-    }
-
-    if ( new_modes & SHM_translate )
-    {
-        if ( !(new_modes & SHM_external) )
-        {
-            ASSERT( !pagetable_get_paddr(d->arch.phys_table) );
-            if ( !alloc_p2m_table(d) )
-            {
-                printk("alloc_p2m_table failed (out-of-memory?)\n");
-                goto nomem;
-            }
-        }
-    }
-
-    // Get rid of any shadow pages from any previous shadow mode.
-    //
-    free_shadow_pages(d);
-
-    d->arch.shadow_mode = mode;
-
-    if ( shadow_mode_refcounts(d) )
-    {
-        struct list_head *list_ent;
-        struct page_info *page;
-
-        /*
-         * Tear down its counts by disassembling its page-table-based refcounts
-         * Also remove CR3's gcount/tcount.
-         * That leaves things like GDTs and LDTs and external refs in tact.
-         *
-         * Most pages will be writable tcount=0.
-         * Some will still be L1 tcount=0 or L2 tcount=0.
-         * Maybe some pages will be type none tcount=0.
-         * Pages granted external writable refs (via grant tables?) will
-         * still have a non-zero tcount.  That's OK.
-         *
-         * gcounts will generally be 1 for PGC_allocated.
-         * GDTs and LDTs will have additional gcounts.
-         * Any grant-table based refs will still be in the gcount.
-         *
-         * We attempt to grab writable refs to each page thus setting its type
-         * Immediately put back those type refs.
-         *
-         * Assert that no pages are left with L1/L2/L3/L4 type.
-         */
-        audit_adjust_pgtables(d, -1, 1);
-
-
-        for (list_ent = d->page_list.next; list_ent != &d->page_list;
-             list_ent = page->list.next) {
-            
-            page = list_entry(list_ent, struct page_info, list);
-
-            if ( !get_page_type(page, PGT_writable_page) )
-                BUG();
-            put_page_type(page);
-            /*
-             * We use tlbflush_timestamp as back pointer to smfn, and need to
-             * clean up it.
-             */
-            if (shadow_mode_external(d))
-                page->tlbflush_timestamp = 0;
-        }
-        
-        audit_adjust_pgtables(d, 1, 1);
-  
-    }
-
-    return 0;
-
- nomem:
-    if ( (new_modes & SHM_enable) )
-    {
-        xfree(d->arch.shadow_ht);
-        d->arch.shadow_ht = NULL;
-    }
-    if ( (new_modes & SHM_log_dirty) )
-    {
-        xfree(d->arch.shadow_dirty_bitmap);
-        d->arch.shadow_dirty_bitmap = NULL;
-    }
-
-    return -ENOMEM;
-}
-
-int shadow_mode_enable(struct domain *d, unsigned int mode)
-{
-    int rc;
-    shadow_lock(d);
-    rc = __shadow_mode_enable(d, mode);
-    shadow_unlock(d);
-    return rc;
-}
-
-static void
-translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
-{
-    int i;
-    l1_pgentry_t *l1;
-
-    l1 = map_domain_page(l1mfn);
-    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
-    {
-        if ( is_guest_l1_slot(i) &&
-             (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
-        {
-            unsigned long mfn = l1e_get_pfn(l1[i]);
-            unsigned long gpfn = mfn_to_gmfn(d, mfn);
-            ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
-            l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i]));
-        }
-    }
-    unmap_domain_page(l1);
-}
-
-// This is not general enough to handle arbitrary pagetables
-// with shared L1 pages, etc., but it is sufficient for bringing
-// up dom0.
-//
-void
-translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn,
-                    unsigned int type)
-{
-    int i;
-    l2_pgentry_t *l2;
-
-    ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
-
-    l2 = map_domain_page(l2mfn);
-    for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
-    {
-        if ( is_guest_l2_slot(type, i) &&
-             (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
-        {
-            unsigned long mfn = l2e_get_pfn(l2[i]);
-            unsigned long gpfn = mfn_to_gmfn(d, mfn);
-            ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
-            l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i]));
-            translate_l1pgtable(d, p2m, mfn);
-        }
-    }
-    unmap_domain_page(l2);
-}
-
-static void free_shadow_ht_entries(struct domain *d)
-{
-    struct shadow_status *x, *n;
-
-    SH_VLOG("freed tables count=%d l1=%d l2=%d",
-            d->arch.shadow_page_count, perfc_value(shadow_l1_pages), 
-            perfc_value(shadow_l2_pages));
-
-    n = d->arch.shadow_ht_extras;
-    while ( (x = n) != NULL )
-    {
-        d->arch.shadow_extras_count--;
-        n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
-        xfree(x);
-    }
-
-    d->arch.shadow_ht_extras = NULL;
-    d->arch.shadow_ht_free = NULL;
-
-    ASSERT(d->arch.shadow_extras_count == 0);
-    SH_VLOG("freed extras, now %d", d->arch.shadow_extras_count);
-
-    if ( d->arch.shadow_dirty_bitmap != NULL )
-    {
-        xfree(d->arch.shadow_dirty_bitmap);
-        d->arch.shadow_dirty_bitmap = 0;
-        d->arch.shadow_dirty_bitmap_size = 0;
-    }
-
-    xfree(d->arch.shadow_ht);
-    d->arch.shadow_ht = NULL;
-}
-
-static void free_out_of_sync_entries(struct domain *d)
-{
-    struct out_of_sync_entry *x, *n;
-
-    n = d->arch.out_of_sync_extras;
-    while ( (x = n) != NULL )
-    {
-        d->arch.out_of_sync_extras_count--;
-        n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
-        xfree(x);
-    }
-
-    d->arch.out_of_sync_extras = NULL;
-    d->arch.out_of_sync_free = NULL;
-    d->arch.out_of_sync = NULL;
-
-    ASSERT(d->arch.out_of_sync_extras_count == 0);
-    FSH_LOG("freed extra out_of_sync entries, now %d",
-            d->arch.out_of_sync_extras_count);
-}
-
-void __shadow_mode_disable(struct domain *d)
-{
-    struct vcpu *v;
-#ifndef NDEBUG
-    int i;
-#endif
-
-    if ( unlikely(!shadow_mode_enabled(d)) )
-        return;
-
-    free_shadow_pages(d);
-    free_writable_pte_predictions(d);
-
-#ifndef NDEBUG
-    for ( i = 0; i < shadow_ht_buckets; i++ )
-    {
-        if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
-        {
-            printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%lx\n",
-                   __FILE__, i, d->arch.shadow_ht[i].gpfn_and_flags);
-            BUG();
-        }
-    }
-#endif
-
-    d->arch.shadow_mode = 0;
-
-    free_shadow_ht_entries(d);
-    free_out_of_sync_entries(d);
-
-    for_each_vcpu(d, v)
-        update_pagetables(v);
-}
-
-static int shadow_mode_table_op(
-    struct domain *d, dom0_shadow_control_t *sc)
-{
-    unsigned int      op = sc->op;
-    int               i, rc = 0;
-    struct vcpu *v;
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    SH_VLOG("shadow mode table op %lx %lx count %d",
-            (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table),  /* XXX SMP */
-            (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */
-            d->arch.shadow_page_count);
-
-    shadow_audit(d, 1);
-
-    switch ( op )
-    {
-    case DOM0_SHADOW_CONTROL_OP_FLUSH:
-        free_shadow_pages(d);
-
-        d->arch.shadow_fault_count       = 0;
-        d->arch.shadow_dirty_count       = 0;
-
-        break;
-   
-    case DOM0_SHADOW_CONTROL_OP_CLEAN:
-        free_shadow_pages(d);
-
-        sc->stats.fault_count       = d->arch.shadow_fault_count;
-        sc->stats.dirty_count       = d->arch.shadow_dirty_count;
-
-        d->arch.shadow_fault_count       = 0;
-        d->arch.shadow_dirty_count       = 0;
-        if ( guest_handle_is_null(sc->dirty_bitmap) ||
-             (d->arch.shadow_dirty_bitmap == NULL) )
-        {
-            rc = -EINVAL;
-            break;
-        }
-
-        if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
-            sc->pages = d->arch.shadow_dirty_bitmap_size;
-
-#define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
-        for ( i = 0; i < sc->pages; i += chunk )
-        {
-            int bytes = ((((sc->pages - i) > chunk) ?
-                          chunk : (sc->pages - i)) + 7) / 8;
-     
-            if ( copy_to_guest_offset(
-                sc->dirty_bitmap, i/(8*sizeof(unsigned long)),
-                d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
-                (bytes+sizeof(unsigned long)-1) / sizeof(unsigned long)) )
-            {
-                rc = -EINVAL;
-                break;
-            }
-
-            memset(
-                d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
-                0, bytes);
-        }
-
-        break;
-
-    case DOM0_SHADOW_CONTROL_OP_PEEK:
-        sc->stats.fault_count       = d->arch.shadow_fault_count;
-        sc->stats.dirty_count       = d->arch.shadow_dirty_count;
-
-        if ( guest_handle_is_null(sc->dirty_bitmap) ||
-             (d->arch.shadow_dirty_bitmap == NULL) )
-        {
-            rc = -EINVAL;
-            break;
-        }
-        if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
-            sc->pages = d->arch.shadow_dirty_bitmap_size;
-
-        if ( copy_to_guest(sc->dirty_bitmap, 
-                           d->arch.shadow_dirty_bitmap,
-                           (((sc->pages+7)/8)+sizeof(unsigned long)-1) /
-                           sizeof(unsigned long)) )
-        {
-            rc = -EINVAL;
-            break;
-        }
-
-        break;
-
-    default:
-        rc = -EINVAL;
-        break;
-    }
-
-    SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
-    shadow_audit(d, 1);
-
-    for_each_vcpu(d,v)
-        __update_pagetables(v);
-
-    return rc;
-}
-
-int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
-{
-    unsigned int op = sc->op;
-    int          rc = 0;
-    struct vcpu *v;
-
-    if ( unlikely(d == current->domain) )
-    {
-        DPRINTK("Don't try to do a shadow op on yourself!\n");
-        return -EINVAL;
-    }   
-
-    domain_pause(d);
-
-    shadow_lock(d);
-
-    switch ( op )
-    {
-    case DOM0_SHADOW_CONTROL_OP_OFF:
-        if ( shadow_mode_enabled(d) )
-        {
-            __shadow_sync_all(d);
-            __shadow_mode_disable(d);
-        }
-        break;
-
-    case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
-        free_shadow_pages(d);
-        rc = __shadow_mode_enable(d, SHM_enable);
-        break;
-
-    case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
-        free_shadow_pages(d);
-        rc = __shadow_mode_enable(
-            d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
-        break;
-
-    case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
-        free_shadow_pages(d);
-        rc = __shadow_mode_enable(
-            d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate|SHM_wr_pt_pte);
-        break;
-
-    default:
-        rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
-        break;
-    }
-
-    shadow_unlock(d);
-
-    for_each_vcpu(d,v)
-        update_pagetables(v);
-
-    domain_unpause(d);
-
-    return rc;
-}
-
-unsigned long
-get_mfn_from_gpfn_foreign(struct domain *d, unsigned long gpfn)
-{
-    unsigned long va, tabpfn;
-    l1_pgentry_t *l1, l1e;
-    l2_pgentry_t *l2, l2e;
-
-    ASSERT(shadow_mode_translate(d));
-
-    perfc_incrc(get_mfn_from_gpfn_foreign);
-
-    if ( shadow_mode_external(d) )
-    {
-        unsigned long mfn;
-        unsigned long *l0;
-
-        va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn));
-
-        tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
-        if ( !tabpfn )
-            return INVALID_MFN;
-
-        l2 = map_domain_page(tabpfn);
-        l2e = l2[l2_table_offset(va)];
-        unmap_domain_page(l2);
-        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-            return INVALID_MFN;
-
-        l1 = map_domain_page(l2e_get_pfn(l2e));
-        l1e = l1[l1_table_offset(va)];
-        unmap_domain_page(l1);
-        if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
-            return INVALID_MFN;
-
-        l0 = map_domain_page(l1e_get_pfn(l1e));
-        mfn = l0[gpfn & ((PAGE_SIZE / sizeof(mfn)) - 1)];
-        unmap_domain_page(l0);
-        return mfn;
-    }
-    else
-    {
-        va = gpfn << PAGE_SHIFT;
-        tabpfn = pagetable_get_pfn(d->arch.phys_table);
-        l2 = map_domain_page(tabpfn);
-        l2e = l2[l2_table_offset(va)];
-        unmap_domain_page(l2);
-        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        {
-#if 0
-            printk("%s(d->id=%d, gpfn=%lx) => 0 l2e=%" PRIpte "\n",
-                   __func__, d->domain_id, gpfn, l2e_get_intpte(l2e));
-#endif
-            return INVALID_MFN;
-        }
-        l1 = map_domain_page(l2e_get_pfn(l2e));
-        l1e = l1[l1_table_offset(va)];
-        unmap_domain_page(l1);
-#if 0
-        printk("%s(d->id=%d, gpfn=%lx) => %lx tabpfn=%lx l2e=%lx l1tab=%lx, l1e=%lx\n",
-               __func__, d->domain_id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, tabpfn, l2e, l1tab, l1e);
-#endif
-
-        return l1e_get_intpte(l1e);
-    }
-
-}
-
-static unsigned long
-shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn,
-                unsigned long smfn)
-{
-    unsigned long hl2mfn;
-    l1_pgentry_t *hl2;
-    l2_pgentry_t *gpgd;
-    int limit;
-    int x;
-
-    ASSERT(PGT_base_page_table == PGT_l2_page_table);
-
-    if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) )
-    {
-        printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n",
-               gpfn, gmfn);
-        BUG(); /* XXX Deal gracefully with failure. */
-    }
-
-    SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx",
-             gpfn, gmfn, smfn, hl2mfn);
-    perfc_incrc(shadow_hl2_table_count);
-
-    hl2 = map_domain_page(hl2mfn);
-
-    if ( shadow_mode_external(d) )
-        limit = L2_PAGETABLE_ENTRIES;
-    else
-        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-
-    memset(hl2, 0, limit * sizeof(l1_pgentry_t));
-
-    if ( !shadow_mode_external(d) )
-    {
-        memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0,
-               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-
-        // Setup easy access to the GL2, SL2, and HL2 frames.
-        //
-        hl2[l2_table_offset(LINEAR_PT_VIRT_START)] =
-            l1e_from_pfn(gmfn, __PAGE_HYPERVISOR);
-        hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
-            l1e_from_pfn(smfn, __PAGE_HYPERVISOR);
-        hl2[l2_table_offset(PERDOMAIN_VIRT_START)] =
-            l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
-    }
-
-    gpgd = map_domain_page(gmfn);
-    for (x = 0; x < DOMAIN_ENTRIES_PER_L2_PAGETABLE; x++)
-        validate_hl2e_change(d, gpgd[x], &hl2[x]);
-    unmap_domain_page(gpgd);
-
-    unmap_domain_page(hl2);
-
-    return hl2mfn;
-}
-
-/*
- * This could take and use a snapshot, and validate the entire page at
- * once, or it could continue to fault in entries one at a time...
- * Might be worth investigating...
- */
-static unsigned long shadow_l2_table(
-    struct domain *d, unsigned long gpfn, unsigned long gmfn)
-{
-    unsigned long smfn;
-    l2_pgentry_t *spl2e;
-    int i;
-
-    SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn);
-
-    perfc_incrc(shadow_l2_table_count);
-
-    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) )
-    {
-        printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n",
-               gpfn, gmfn);
-        BUG(); /* XXX Deal gracefully with failure. */
-    }
-
-    spl2e = (l2_pgentry_t *)map_domain_page(smfn);
-
-    /* Install hypervisor and 2x linear p.t. mapings. */
-    if ( (PGT_base_page_table == PGT_l2_page_table) &&
-         !shadow_mode_external(d) )
-    {
-        /*
-         * We could proactively fill in PDEs for pages that are already
-         * shadowed *and* where the guest PDE has _PAGE_ACCESSED set
-         * (restriction required for coherence of the accessed bit). However,
-         * we tried it and it didn't help performance. This is simpler. 
-         */
-        memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
-
-        /* Install hypervisor and 2x linear p.t. mapings. */
-        memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
-               &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-
-        spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
-            l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
-
-        for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
-            spl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
-            l2e_from_page(virt_to_page(page_get_owner(mfn_to_page(gmfn))->
-                                       arch.mm_perdomain_pt) + i,
-                          __PAGE_HYPERVISOR);
-
-        if ( shadow_mode_translate(d) ) // NB: not external
-        {
-            unsigned long hl2mfn;
-
-            ASSERT(pagetable_get_paddr(d->arch.phys_table));
-            spl2e[l2_table_offset(RO_MPT_VIRT_START)] =
-                l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table),
-                                __PAGE_HYPERVISOR);
-
-            if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
-                hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
-
-            // shadow_mode_translate (but not external) sl2 tables hold a
-            // ref to their hl2.
-            //
-            if ( !get_shadow_ref(hl2mfn) )
-                BUG();
-            
-            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
-                l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
-        }
-        else
-            spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
-                l2e_from_pfn(gmfn, __PAGE_HYPERVISOR);
-    }
-    else
-    {
-        memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t));        
-    }
-
-    unmap_domain_page(spl2e);
-
-    SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn);
-    return smfn;
-}
-
-void shadow_map_l1_into_current_l2(unsigned long va)
-{ 
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    l1_pgentry_t *gpl1e, *spl1e;
-    l2_pgentry_t gl2e, sl2e;
-    unsigned long gl1pfn, gl1mfn, sl1mfn;
-    int i, init_table = 0;
-
-    __guest_get_l2e(v, va, &gl2e);
-    ASSERT(l2e_get_flags(gl2e) & _PAGE_PRESENT);
-    gl1pfn = l2e_get_pfn(gl2e);
-
-    if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) )
-    {
-        /* This L1 is NOT already shadowed so we need to shadow it. */
-        SH_VVLOG("4a: l1 not shadowed");
-
-        gl1mfn = gmfn_to_mfn(d, gl1pfn);
-        if ( unlikely(!VALID_MFN(gl1mfn)) )
-        {
-            // Attempt to use an invalid pfn as an L1 page.
-            // XXX this needs to be more graceful!
-            BUG();
-        }
-
-        if ( unlikely(!(sl1mfn =
-                        alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) )
-        {
-            printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n",
-                   gl1pfn, gl1mfn);
-            BUG(); /* XXX Need to deal gracefully with failure. */
-        }
-
-        perfc_incrc(shadow_l1_table_count);
-        init_table = 1;
-    }
-    else
-    {
-        /* This L1 is shadowed already, but the L2 entry is missing. */
-        SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn);
-    }
-
-#ifndef NDEBUG
-    {
-        l2_pgentry_t old_sl2e;
-        __shadow_get_l2e(v, va, &old_sl2e);
-        ASSERT( !(l2e_get_flags(old_sl2e) & _PAGE_PRESENT) );
-    }
-#endif
-
-    if ( !get_shadow_ref(sl1mfn) )
-        BUG();
-    l2pde_general(d, &gl2e, &sl2e, sl1mfn);
-    __guest_set_l2e(v, va, gl2e);
-    __shadow_set_l2e(v, va, sl2e);
-
-    if ( init_table )
-    {
-        l1_pgentry_t sl1e;
-        int index = l1_table_offset(va);
-        int min = 1, max = 0;
-
-        gpl1e = &(linear_pg_table[l1_linear_offset(va) &
-                              ~(L1_PAGETABLE_ENTRIES-1)]);
-
-        spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) &
-                                     ~(L1_PAGETABLE_ENTRIES-1)]);
-
-        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-        {
-            l1pte_propagate_from_guest(d, gpl1e[i], &sl1e);
-            if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) &&
-                 unlikely(!shadow_get_page_from_l1e(sl1e, d)) )
-                sl1e = l1e_empty();
-            if ( l1e_get_flags(sl1e) == 0 )
-            {
-                // First copy entries from 0 until first invalid.
-                // Then copy entries from index until first invalid.
-                //
-                if ( i < index ) {
-                    i = index - 1;
-                    continue;
-                }
-                break;
-            }
-            spl1e[i] = sl1e;
-            if ( unlikely(i < min) )
-                min = i;
-            if ( likely(i > max) )
-                max = i;
-            set_guest_back_ptr(d, sl1e, sl1mfn, i);
-        }
-
-        mfn_to_page(sl1mfn)->tlbflush_timestamp =
-            SHADOW_ENCODE_MIN_MAX(min, max);
-    }
-}
-
-void shadow_invlpg(struct vcpu *v, unsigned long va)
-{
-    struct domain *d = v->domain;
-    l1_pgentry_t gpte, spte;
-
-    ASSERT(shadow_mode_enabled(d));
-
-    shadow_lock(d);
-
-    __shadow_sync_va(v, va);
-
-    // XXX mafetter: will need to think about 4MB pages...
-
-    // It's not strictly necessary to update the shadow here,
-    // but it might save a fault later.
-    //
-    if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT],
-                         sizeof(gpte))) {
-        perfc_incrc(shadow_invlpg_faults);
-        shadow_unlock(d);
-        return;
-    }
-    l1pte_propagate_from_guest(d, gpte, &spte);
-    shadow_set_l1e(va, spte, 1);
-
-    shadow_unlock(d);
-}
-
-struct out_of_sync_entry *
-shadow_alloc_oos_entry(struct domain *d)
-{
-    struct out_of_sync_entry *f, *extra;
-    unsigned size, i;
-
-    if ( unlikely(d->arch.out_of_sync_free == NULL) )
-    {
-        FSH_LOG("Allocate more fullshadow tuple blocks.");
-
-        size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f));
-        extra = xmalloc_bytes(size);
-
-        /* XXX Should be more graceful here. */
-        if ( extra == NULL )
-            BUG();
-
-        memset(extra, 0, size);
-
-        /* Record the allocation block so it can be correctly freed later. */
-        d->arch.out_of_sync_extras_count++;
-        *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) = 
-            d->arch.out_of_sync_extras;
-        d->arch.out_of_sync_extras = &extra[0];
-
-        /* Thread a free chain through the newly-allocated nodes. */
-        for ( i = 0; i < (out_of_sync_extra_size - 1); i++ )
-            extra[i].next = &extra[i+1];
-        extra[i].next = NULL;
-
-        /* Add the new nodes to the free list. */
-        d->arch.out_of_sync_free = &extra[0];
-    }
-
-    /* Allocate a new node from the quicklist. */
-    f = d->arch.out_of_sync_free;
-    d->arch.out_of_sync_free = f->next;
-
-    return f;
-}
-
-static inline unsigned long
-shadow_make_snapshot(
-    struct domain *d, unsigned long gpfn, unsigned long gmfn)
-{
-    unsigned long smfn, sl1mfn = 0;
-    void *original, *snapshot;
-    u32 min_max = 0;
-    int min, max, length;
-
-    if ( test_and_set_bit(_PGC_out_of_sync, &mfn_to_page(gmfn)->count_info) )
-    {
-        ASSERT(__shadow_status(d, gpfn, PGT_snapshot));
-        return SHADOW_SNAPSHOT_ELSEWHERE;
-    }
-
-    perfc_incrc(shadow_make_snapshot);
-
-    if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) )
-    {
-        printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n"
-               "Dom%d snapshot_count_count=%d\n",
-               gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count);
-        BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */
-    }
-
-    if ( !get_shadow_ref(smfn) )
-        BUG();
-
-    if ( shadow_mode_refcounts(d) &&
-         (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) )
-        min_max = mfn_to_page(sl1mfn)->tlbflush_timestamp;
-    mfn_to_page(smfn)->tlbflush_timestamp = min_max;
-
-    min = SHADOW_MIN(min_max);
-    max = SHADOW_MAX(min_max);
-    length = max - min + 1;
-    perfc_incr_histo(snapshot_copies, length, PT_UPDATES);
-
-    min *= sizeof(l1_pgentry_t);
-    length *= sizeof(l1_pgentry_t);
-
-    original = map_domain_page(gmfn);
-    snapshot = map_domain_page(smfn);
-    memcpy(snapshot + min, original + min, length);
-    unmap_domain_page(original);
-    unmap_domain_page(snapshot);
-
-    return smfn;
-}
-
-static void
-shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
-{
-    void *snapshot;
-
-    if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
-        return;
-
-    // Clear the out_of_sync bit.
-    //
-    clear_bit(_PGC_out_of_sync, &mfn_to_page(entry->gmfn)->count_info);
-
-    // XXX Need to think about how to protect the domain's
-    // information less expensively.
-    //
-    snapshot = map_domain_page(entry->snapshot_mfn);
-    memset(snapshot, 0, PAGE_SIZE);
-    unmap_domain_page(snapshot);
-
-    put_shadow_ref(entry->snapshot_mfn);
-}
-
-struct out_of_sync_entry *
-__shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
-                             unsigned long mfn)
-{
-    struct domain *d = v->domain;
-    struct page_info *page = mfn_to_page(mfn);
-    struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d);
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(mfn_valid(mfn));
-
-#ifndef NDEBUG
-    {
-        u32 type = page->u.inuse.type_info & PGT_type_mask;
-        if ( shadow_mode_refcounts(d) )
-        {
-            ASSERT(type == PGT_writable_page);
-        }
-        else
-        {
-            ASSERT(type && (type < PGT_l4_page_table));
-        }
-    }
-#endif
-
-    FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08lx", __func__,
-            gpfn, mfn, page->count_info, page->u.inuse.type_info);
-
-    // XXX this will require some more thought...  Cross-domain sharing and
-    //     modification of page tables?  Hmm...
-    //
-    if ( d != page_get_owner(page) )
-        BUG();
-
-    perfc_incrc(shadow_mark_mfn_out_of_sync_calls);
-
-    entry->v = v;
-    entry->gpfn = gpfn;
-    entry->gmfn = mfn;
-    entry->writable_pl1e = -1;
-
-#if SHADOW_DEBUG
-    mark_shadows_as_reflecting_snapshot(d, gpfn);
-#endif
-
-    // increment guest's ref count to represent the entry in the
-    // full shadow out-of-sync list.
-    //
-    get_page(page, d);
-
-    return entry;
-}
-
-struct out_of_sync_entry *
-shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
-                             unsigned long mfn)
-{
-    struct out_of_sync_entry *entry =
-      __shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
-    struct domain *d = v->domain;
-
-    entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
-    // Add to the out-of-sync list
-    //
-    entry->next = d->arch.out_of_sync;
-    d->arch.out_of_sync = entry;
-
-    return entry;
-}
-
-void shadow_mark_va_out_of_sync(
-    struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va)
-{
-    struct out_of_sync_entry *entry =
-        __shadow_mark_mfn_out_of_sync(v, gpfn, mfn);
-    l2_pgentry_t sl2e;
-    struct domain *d = v->domain;
-
-    // We need the address of shadow PTE that maps @va.
-    // It might not exist yet.  Make sure it's there.
-    //
-    __shadow_get_l2e(v, va, &sl2e);
-    if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
-    {
-        // either this L1 isn't shadowed yet, or the shadow isn't linked into
-        // the current L2.
-        shadow_map_l1_into_current_l2(va);
-        __shadow_get_l2e(v, va, &sl2e);
-    }
-    ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT);
-
-    entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn);
-    // NB: this is stored as a machine address.
-    entry->writable_pl1e =
-        l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va));
-    ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) );
-    entry->va = va;
-
-    // Increment shadow's page count to represent the reference
-    // inherent in entry->writable_pl1e
-    //
-    if ( !get_shadow_ref(l2e_get_pfn(sl2e)) )
-        BUG();
-
-    // Add to the out-of-sync list
-    //
-    entry->next = d->arch.out_of_sync;
-    d->arch.out_of_sync = entry;
-
-    FSH_LOG("%s(va=%lx -> writable_pl1e=%lx)",
-            __func__, va, entry->writable_pl1e);
-}
-
-/*
- * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches.
- * Returns 0 otherwise.
- */
-static int snapshot_entry_matches(
-    struct domain *d, l1_pgentry_t *guest_pt,
-    unsigned long gpfn, unsigned index)
-{
-    unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot);
-    l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ...
-    int entries_match;
-
-    perfc_incrc(snapshot_entry_matches_calls);
-
-    if ( !smfn )
-        return 0;
-
-    snapshot = map_domain_page(smfn);
-
-    if (__copy_from_user(&gpte, &guest_pt[index],
-                         sizeof(gpte))) {
-        unmap_domain_page(snapshot);
-        return 0;
-    }
-
-    // This could probably be smarter, but this is sufficent for
-    // our current needs.
-    //
-    entries_match = !l1e_has_changed(gpte, snapshot[index],
-                                     PAGE_FLAG_MASK);
-
-    unmap_domain_page(snapshot);
-
-#ifdef PERF_COUNTERS
-    if ( entries_match )
-        perfc_incrc(snapshot_entry_matches_true);
-#endif
-
-    return entries_match;
-}
-
-/*
- * Returns 1 if va's shadow mapping is out-of-sync.
- * Returns 0 otherwise.
- */
-int __shadow_out_of_sync(struct vcpu *v, unsigned long va)
-{
-    struct domain *d = v->domain;
-    unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table);
-    unsigned long l2pfn = mfn_to_gmfn(d, l2mfn);
-    l2_pgentry_t l2e;
-    unsigned long l1pfn, l1mfn;
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(VALID_M2P(l2pfn));
-
-    perfc_incrc(shadow_out_of_sync_calls);
-
-    if ( page_out_of_sync(mfn_to_page(l2mfn)) &&
-         !snapshot_entry_matches(d, (l1_pgentry_t *)v->arch.guest_vtable,
-                                 l2pfn, l2_table_offset(va)) )
-        return 1;
-
-    __guest_get_l2e(v, va, &l2e);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        return 0;
-
-    l1pfn = l2e_get_pfn(l2e);
-    l1mfn = gmfn_to_mfn(d, l1pfn);
-
-    // If the l1 pfn is invalid, it can't be out of sync...
-    if ( !VALID_MFN(l1mfn) )
-        return 0;
-
-    if ( page_out_of_sync(mfn_to_page(l1mfn)) &&
-         !snapshot_entry_matches(
-             d, &linear_pg_table[l1_linear_offset(va) & ~(L1_PAGETABLE_ENTRIES-1)],
-             l1pfn, l1_table_offset(va)) )
-        return 1;
-
-    return 0;
-}
-
-#define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t)))
-static inline unsigned long
-predict_writable_pte_page(struct domain *d, unsigned long gpfn)
-{
-    return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred);
-}
-
-static inline void
-increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
-{
-    unsigned long score = prediction & PGT_score_mask;
-    int create = (score == 0);
-
-    // saturating addition
-    score = (score + (1u << PGT_score_shift)) & PGT_score_mask;
-    score = score ? score : PGT_score_mask;
-
-    prediction = (prediction & PGT_mfn_mask) | score;
-
-    //printk("increase gpfn=%lx pred=%lx create=%d\n", gpfn, prediction, create);
-    set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred, 0);
-
-    if ( create )
-        perfc_incr(writable_pte_predictions);
-}
-
-static inline void
-decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction)
-{
-    unsigned long score = prediction & PGT_score_mask;
-    ASSERT(score);
-
-    // divide score by 2...  We don't like bad predictions.
-    //
-    score = (score >> 1) & PGT_score_mask;
-
-    prediction = (prediction & PGT_mfn_mask) | score;
-
-    //printk("decrease gpfn=%lx pred=%lx score=%lx\n", gpfn, prediction, score);
-
-    if ( score )
-        set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred, 0);
-    else
-    {
-        delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred, 0);
-        perfc_decr(writable_pte_predictions);
-    }
-}
-
-static void
-free_writable_pte_predictions(struct domain *d)
-{
-    int i;
-    struct shadow_status *x;
-
-    for ( i = 0; i < shadow_ht_buckets; i++ )
-    {
-        u32 count;
-        unsigned long *gpfn_list;
-
-        /* Skip empty buckets. */
-        if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
-            continue;
-
-        count = 0;
-        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
-            if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
-                count++;
-
-        gpfn_list = xmalloc_array(unsigned long, count);
-        count = 0;
-        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
-            if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
-                gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
-
-        while ( count )
-        {
-            count--;
-            /* delete_shadow_status() may do a shadow_audit(), so we need to
-             * keep an accurate count of writable_pte_predictions to keep it
-             * happy.
-             */
-            delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred, 0);
-            perfc_decr(writable_pte_predictions);
-        }
-
-        xfree(gpfn_list);
-    }
-}
-
-static int fix_entry(
-    struct domain *d, 
-    l1_pgentry_t *pt, u32 *found, int is_l1_shadow, u32 max_refs_to_find)
-{
-    l1_pgentry_t old = *pt;
-    l1_pgentry_t new = old;
-
-    l1e_remove_flags(new,_PAGE_RW);
-    if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) )
-        BUG();
-    (*found)++;
-    *pt = new;
-    if ( is_l1_shadow )
-        shadow_put_page_from_l1e(old, d);
-
-    return (*found == max_refs_to_find);
-}
-
-static u32 remove_all_write_access_in_ptpage(
-    struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn,
-    unsigned long readonly_gpfn, unsigned long readonly_gmfn,
-    u32 max_refs_to_find, unsigned long prediction)
-{
-    l1_pgentry_t *pt = map_domain_page(pt_mfn);
-    l1_pgentry_t match;
-    unsigned long flags = _PAGE_RW | _PAGE_PRESENT;
-    int i;
-    u32 found = 0;
-    int is_l1_shadow =
-        ((mfn_to_page(pt_mfn)->u.inuse.type_info & PGT_type_mask) ==
-         PGT_l1_shadow);
-
-    match = l1e_from_pfn(readonly_gmfn, flags);
-
-    if ( shadow_mode_external(d) ) {
-        i = (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_va_mask) 
-            >> PGT_va_shift;
-
-        if ( (i >= 0 && i < L1_PAGETABLE_ENTRIES) &&
-             !l1e_has_changed(pt[i], match, flags) && 
-             fix_entry(d, &pt[i], &found, is_l1_shadow, max_refs_to_find) &&
-             !prediction )
-            goto out;
-    }
-
-    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
-    {
-        if ( unlikely(!l1e_has_changed(pt[i], match, flags)) && 
-             fix_entry(d, &pt[i], &found, is_l1_shadow, max_refs_to_find) )
-            break;
-    }
-
-out:
-    unmap_domain_page(pt);
-
-    return found;
-}
-
-int shadow_remove_all_write_access(
-    struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
-{
-    int i;
-    struct shadow_status *a;
-    u32 found = 0, write_refs;
-    unsigned long predicted_smfn;
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(VALID_MFN(readonly_gmfn));
-
-    perfc_incrc(remove_write_access);
-
-    // If it's not a writable page, then no writable refs can be outstanding.
-    //
-    if ( (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_type_mask) !=
-         PGT_writable_page )
-    {
-        perfc_incrc(remove_write_not_writable);
-        return 1;
-    }
-
-    // How many outstanding writable PTEs for this page are there?
-    //
-    write_refs =
-        (mfn_to_page(readonly_gmfn)->u.inuse.type_info & PGT_count_mask);
-    if ( write_refs && MFN_PINNED(readonly_gmfn) )
-    {
-        write_refs--;
-    }
-
-    if ( write_refs == 0 )
-    {
-        perfc_incrc(remove_write_no_work);
-        return 1;
-    }
-    
-    if ( shadow_mode_external(d) ) {
-        if (--write_refs == 0) 
-            return 0;
-
-         // Use the back pointer to locate the shadow page that can contain
-         // the PTE of interest
-         if ( (predicted_smfn = mfn_to_page(readonly_gmfn)->tlbflush_timestamp) ) {
-             found += remove_all_write_access_in_ptpage(
-                 d, predicted_smfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, 0);
-             if ( found == write_refs )
-                 return 0;
-         }
-    }
-
-    // Search all the shadow L1 page tables...
-    //
-    for (i = 0; i < shadow_ht_buckets; i++)
-    {
-        a = &d->arch.shadow_ht[i];
-        while ( a && a->gpfn_and_flags )
-        {
-            if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow )
-            {
-                found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask);
-                if ( found == write_refs )
-                    return 0;
-            }
-
-            a = a->next;
-        }
-    }
-
-    FSH_LOG("%s: looking for %d refs, found %d refs",
-            __func__, write_refs, found);
-
-    return 0;
-}
-
-static u32 remove_all_access_in_page(
-    struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
-{
-    l1_pgentry_t *pl1e = map_domain_page(l1mfn);
-    l1_pgentry_t match, ol2e;
-    unsigned long flags  = _PAGE_PRESENT;
-    int i;
-    u32 count = 0;
-    int is_l1_shadow =
-        ((mfn_to_page(l1mfn)->u.inuse.type_info & PGT_type_mask) ==
-         PGT_l1_shadow);
-
-    match = l1e_from_pfn(forbidden_gmfn, flags);
-    
-    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
-    {
-        if ( l1e_has_changed(pl1e[i], match, flags) )
-            continue;
-
-        ol2e = pl1e[i];
-        pl1e[i] = l1e_empty();
-        count++;
-
-        if ( is_l1_shadow )
-            shadow_put_page_from_l1e(ol2e, d);
-        else /* must be an hl2 page */
-            put_page(mfn_to_page(forbidden_gmfn));
-    }
-
-    unmap_domain_page(pl1e);
-
-    return count;
-}
-
-u32 shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
-{
-    int i;
-    struct shadow_status *a;
-    u32 count = 0;
-
-    if ( unlikely(!shadow_mode_enabled(d)) )
-        return 0;
-
-    ASSERT(shadow_lock_is_acquired(d));
-    perfc_incrc(remove_all_access);
-
-    for (i = 0; i < shadow_ht_buckets; i++)
-    {
-        a = &d->arch.shadow_ht[i];
-        while ( a && a->gpfn_and_flags )
-        {
-            switch (a->gpfn_and_flags & PGT_type_mask)
-            {
-            case PGT_l1_shadow:
-            case PGT_l2_shadow:
-            case PGT_l3_shadow:
-            case PGT_l4_shadow:
-            case PGT_hl2_shadow:
-                count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
-                break;
-            case PGT_snapshot:
-            case PGT_writable_pred:
-                // these can't hold refs to the forbidden page
-                break;
-            default:
-                BUG();
-            }
-
-            a = a->next;
-        }
-    }
-
-    return count;
-}    
-
-static int resync_all(struct domain *d, u32 stype)
-{
-    struct out_of_sync_entry *entry;
-    unsigned i;
-    unsigned long smfn;
-    void *guest, *shadow, *snapshot;
-    int need_flush = 0, external = shadow_mode_external(d);
-    int unshadow;
-    int changed;
-    u32 min_max_shadow, min_max_snapshot;
-    int min_shadow, max_shadow, min_snapshot, max_snapshot;
-    struct vcpu *v;
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
-    {
-        if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
-            continue;
-
-        smfn = __shadow_status(d, entry->gpfn, stype);
-
-        if ( !smfn )
-        {
-            // For heavy weight shadows: no need to update refcounts if
-            // there's no shadow page.
-            //
-            if ( shadow_mode_refcounts(d) )
-                continue;
-
-            // For light weight shadows: only need up resync the refcounts to
-            // the new contents of the guest page iff this it has the right
-            // page type.
-            //
-            if ( stype != ( mfn_to_page(entry->gmfn)->u.inuse.type_info & PGT_type_mask) )
-                continue;
-        }
-
-        FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx",
-                stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn);
-
-        // Compare guest's new contents to its snapshot, validating
-        // and updating its shadow as appropriate.
-        //
-        guest    = map_domain_page(entry->gmfn);
-        snapshot = map_domain_page(entry->snapshot_mfn);
-
-        if ( smfn )
-            shadow = map_domain_page(smfn);
-        else
-            shadow = NULL;
-
-        unshadow = 0;
-
-        switch ( stype ) {
-        case PGT_l1_shadow:
-        {
-            l1_pgentry_t *guest1 = guest;
-            l1_pgentry_t *shadow1 = shadow;
-            l1_pgentry_t *snapshot1 = snapshot;
-            int unshadow_l1 = 0;
-
-            ASSERT(shadow_mode_write_l1(d) ||
-                   shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
-
-            if ( !shadow_mode_refcounts(d) )
-                revalidate_l1(d, guest1, snapshot1);
-
-            if ( !smfn )
-                break;
-
-            min_max_shadow = mfn_to_page(smfn)->tlbflush_timestamp;
-            min_shadow     = SHADOW_MIN(min_max_shadow);
-            max_shadow     = SHADOW_MAX(min_max_shadow);
-
-            min_max_snapshot =
-                mfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp;
-            min_snapshot     = SHADOW_MIN(min_max_snapshot);
-            max_snapshot     = SHADOW_MAX(min_max_snapshot);
-
-            changed = 0;
-
-            for ( i = min_shadow; i <= max_shadow; i++ )
-            {
-                if ( (i < min_snapshot) || (i > max_snapshot) ||
-                     l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) )
-                {
-                    int error;
-
-                    error = validate_pte_change(d, guest1[i], &shadow1[i]);
-                    if ( error ==  -1 ) 
-                        unshadow_l1 = 1;
-                    else {
-                        need_flush |= error;
-                        if ( l1e_get_flags(shadow1[i]) & _PAGE_PRESENT )
-                            set_guest_back_ptr(d, shadow1[i], smfn, i);
-                    }
-
-                    // can't update snapshots of linear page tables -- they
-                    // are used multiple times...
-                    //
-                    // snapshot[i] = new_pte;
-                    changed++;
-                }
-            }
-            perfc_incrc(resync_l1);
-            perfc_incr_histo(wpt_updates, changed, PT_UPDATES);
-            perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES);
-            if (unshadow_l1) {
-                l2_pgentry_t l2e;
-
-                __shadow_get_l2e(entry->v, entry->va, &l2e);
-                if (l2e_get_flags(l2e) & _PAGE_PRESENT) {
-                    put_shadow_ref(l2e_get_pfn(l2e));
-                    l2e = l2e_empty();
-                    __shadow_set_l2e(entry->v, entry->va, l2e);
-
-                    if (entry->v == current)
-                        need_flush = 1;
-                }
-            }
-
-            break;
-        }
-        case PGT_l2_shadow:
-        {
-            int max = -1;
-
-            l2_pgentry_t *guest2 = guest;
-            l2_pgentry_t *shadow2 = shadow;
-            l2_pgentry_t *snapshot2 = snapshot;
-
-            ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
-            BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
-
-            changed = 0;
-            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-            {
-                l2_pgentry_t new_pde = guest2[i];
-
-                if ( !is_guest_l2_slot(0,i) && !external )
-                    continue;
-
-                if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK))
-                {
-                    need_flush |= validate_pde_change(d, new_pde, &shadow2[i]);
-
-                    // can't update snapshots of linear page tables -- they
-                    // are used multiple times...
-                    //
-                    // snapshot[i] = new_pde;
-
-                    changed++;
-                }
-                if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */
-                    max = i;
-
-                // XXX - This hack works for linux guests.
-                //       Need a better solution long term.
-                if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) &&
-                     unlikely(l2e_get_intpte(new_pde) != 0) &&
-                     !unshadow && MFN_PINNED(smfn) )
-                    unshadow = 1;
-            }
-            if ( max == -1 )
-                unshadow = 1;
-            perfc_incrc(resync_l2);
-            perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES);
-            break;
-        }
-        case PGT_hl2_shadow:
-        {
-            l2_pgentry_t *guest2 = guest;
-            l2_pgentry_t *snapshot2 = snapshot;
-            l1_pgentry_t *shadow2 = shadow;
-
-            ASSERT(shadow_mode_write_all(d) || shadow_mode_wr_pt_pte(d));
-            BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented
-
-            changed = 0;
-            for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-            {
-                l2_pgentry_t new_pde = guest2[i];
-
-                if ( !is_guest_l2_slot(0, i) && !external )
-                    continue;
-
-                if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) )
-                {
-                    need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]);
-
-                    // can't update snapshots of linear page tables -- they
-                    // are used multiple times...
-                    //
-                    // snapshot[i] = new_pde;
-
-                    changed++;
-                }
-            }
-            perfc_incrc(resync_hl2);
-            perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES);
-            break;
-        }
-        default:
-            BUG();
-        }
-
-        if ( smfn )
-            unmap_domain_page(shadow);
-        unmap_domain_page(snapshot);
-        unmap_domain_page(guest);
-
-        if ( unlikely(unshadow) )
-        {
-            for_each_vcpu(d, v)
-                if(smfn == pagetable_get_pfn(v->arch.shadow_table))
-                    return need_flush;
-            perfc_incrc(unshadow_l2_count);
-            shadow_unpin(smfn);
-            if ( unlikely(shadow_mode_external(d)) )
-            {
-                unsigned long hl2mfn;
-
-                if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) &&
-                     MFN_PINNED(hl2mfn) )
-                    shadow_unpin(hl2mfn);
-            }
-        }
-    }
-
-    return need_flush;
-}
-
-void __shadow_sync_all(struct domain *d)
-{
-    struct out_of_sync_entry *entry;
-    int need_flush = 0;
-    l1_pgentry_t *ppte, opte, npte;
-    cpumask_t other_vcpus_mask;
-
-    perfc_incrc(shadow_sync_all);
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    // First, remove all write permissions to the page tables
-    //
-    for ( entry = d->arch.out_of_sync; entry; entry = entry->next)
-    {
-        // Skip entries that have low bits set...  Those aren't
-        // real PTEs.
-        //
-        if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) )
-            continue;
-
-        ppte = (l1_pgentry_t *)(
-            (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) +
-            (entry->writable_pl1e & ~PAGE_MASK));
-        opte = npte = *ppte;
-        l1e_remove_flags(npte, _PAGE_RW);
-
-        if ( (l1e_get_flags(npte) & _PAGE_PRESENT) &&
-             !shadow_get_page_from_l1e(npte, d) )
-            BUG();
-        *ppte = npte;
-        set_guest_back_ptr(d, npte, (entry->writable_pl1e) >> PAGE_SHIFT, 
-                           (entry->writable_pl1e & ~PAGE_MASK)/sizeof(l1_pgentry_t));
-        shadow_put_page_from_l1e(opte, d);
-
-        unmap_domain_page(ppte);
-    }
-
-    /* Other VCPUs mustn't use the revoked writable mappings. */
-    other_vcpus_mask = d->domain_dirty_cpumask;
-    cpu_clear(smp_processor_id(), other_vcpus_mask);
-    flush_tlb_mask(other_vcpus_mask);
-
-    /* Flush ourself later. */
-    need_flush = 1;
-
-    /* Second, resync all L1 pages, then L2 pages, etc... */
-    need_flush |= resync_all(d, PGT_l1_shadow);
-    if ( shadow_mode_translate(d) )
-        need_flush |= resync_all(d, PGT_hl2_shadow);
-    need_flush |= resync_all(d, PGT_l2_shadow);
-
-    if ( need_flush && !unlikely(shadow_mode_external(d)) )
-        local_flush_tlb();
-
-    free_out_of_sync_state(d);
-}
-
-int shadow_fault(unsigned long va, struct cpu_user_regs *regs)
-{
-    l1_pgentry_t gpte, spte, orig_gpte;
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    l2_pgentry_t gpde;
-
-    spte = l1e_empty();
-
-    SH_VVLOG("shadow_fault( va=%lx, code=%lu )",
-             va, (unsigned long)regs->error_code);
-    perfc_incrc(shadow_fault_calls);
-
-    check_pagetable(v, "pre-sf");
-
-    /*
-     * Don't let someone else take the guest's table pages out-of-sync.
-     */
-    shadow_lock(d);
-
-    /* XXX - FIX THIS COMMENT!!!
-     * STEP 1. Check to see if this fault might have been caused by an
-     *         out-of-sync table page entry, or if we should pass this
-     *         fault onto the guest.
-     */
-    __shadow_sync_va(v, va);
-
-    /*
-     * STEP 2. Check the guest PTE.
-     */
-    __guest_get_l2e(v, va, &gpde);
-    if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
-    {
-        SH_VVLOG("shadow_fault - EXIT: L2 not present (%x)",
-                 l2e_get_intpte(gpde));
-        perfc_incrc(shadow_fault_bail_pde_not_present);
-        goto fail;
-    }
-
-    // This can't fault because we hold the shadow lock and we've ensured that
-    // the mapping is in-sync, so the check of the PDE's present bit, above,
-    // covers this access.
-    //
-    if ( __copy_from_user(&gpte,
-                          &linear_pg_table[l1_linear_offset(va)],
-                          sizeof(gpte)) ) {
-        printk("%s() failed, crashing domain %d "
-               "due to a unaccessible linear page table (gpde=%" PRIpte "), va=%lx\n",
-               __func__, d->domain_id, l2e_get_intpte(gpde), va);
-        domain_crash_synchronous();
-    }
-    orig_gpte = gpte;
-
-    if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) )
-    {
-        SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ") (gpde %" PRIpte ")",
-                 l1e_get_intpte(gpte),
-                 l2e_get_intpte(gpde));
-        perfc_incrc(shadow_fault_bail_pte_not_present);
-        goto fail;
-    }
-
-    /* Write fault? */
-    if ( regs->error_code & 2 )
-    {
-        int allow_writes = 0;
-
-        if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) )
-        {
-            if ( shadow_mode_page_writable(va, regs, l1e_get_pfn(gpte)) )
-            {
-                allow_writes = 1;
-                l1e_add_flags(gpte, _PAGE_RW);
-            }
-            else
-            {
-                /* Write fault on a read-only mapping. */
-                SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")",
-                         l1e_get_intpte(gpte));
-                perfc_incrc(shadow_fault_bail_ro_mapping);
-                goto fail;
-            }
-        }
-        else if ( unlikely(!shadow_mode_wr_pt_pte(d) && mfn_is_page_table(l1e_get_pfn(gpte))) )
-        {
-            SH_LOG("l1pte_write_fault: no write access to page table page");
-            domain_crash_synchronous();
-        }
-
-        /* User access violation in guest? */
-        if ( unlikely((regs->error_code & 4) &&
-                      !(l1e_get_flags(gpte) & _PAGE_USER)))
-        {
-            SH_VVLOG("shadow_fault - EXIT: wr fault on super page (%" PRIpte ")",
-                    l1e_get_intpte(gpte));
-            goto fail;
-
-        }
-
-        if ( unlikely(!l1pte_write_fault(v, &gpte, &spte, va)) )
-        {
-            SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
-            perfc_incrc(write_fault_bail);
-            shadow_unlock(d);
-            return 0;
-        }
-
-        if ( allow_writes )
-            l1e_remove_flags(gpte, _PAGE_RW);
-    }
-    else
-    {
-        /* Read-protection violation in guest? */
-        if ( unlikely((regs->error_code & 1) ))
-        {
-            SH_VVLOG("shadow_fault - EXIT: read fault on super page (%" PRIpte ")",
-                    l1e_get_intpte(gpte));
-            goto fail;
-
-        }
-
-
-        if ( !l1pte_read_fault(d, &gpte, &spte) )
-        {
-            SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed");
-            perfc_incrc(read_fault_bail);
-            shadow_unlock(d);
-            return 0;
-        }
-    }
-
-    /*
-     * STEP 3. Write the modified shadow PTE and guest PTE back to the tables.
-     */
-    if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) )
-    {
-        /* XXX Watch out for read-only L2 entries! (not used in Linux). */
-        if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)],
-                                     &gpte, sizeof(gpte))) )
-        {
-            printk("%s() failed, crashing domain %d "
-                   "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n",
-                   __func__,d->domain_id, l2e_get_intpte(gpde), va);
-            domain_crash_synchronous();
-        }
-
-        __mark_dirty(d, gmfn_to_mfn(d, l2e_get_pfn(gpde)));
-    }
-
-    shadow_set_l1e(va, spte, 1);
-
-    perfc_incrc(shadow_fault_fixed);
-    d->arch.shadow_fault_count++;
-
-    shadow_unlock(d);
-
-    check_pagetable(v, "post-sf");
-    return EXCRET_fault_fixed;
-
- fail:
-    shadow_unlock(d);
-    return 0;
-}
-
-void shadow_l1_normal_pt_update(
-    struct domain *d,
-    unsigned long pa, l1_pgentry_t gpte,
-    struct domain_mmap_cache *cache)
-{
-    unsigned long sl1mfn;    
-    l1_pgentry_t *spl1e, spte;
-
-    shadow_lock(d);
-
-    sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow);
-    if ( sl1mfn )
-    {
-        SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%" PRIpte,
-                 (void *)pa, l1e_get_intpte(gpte));
-        l1pte_propagate_from_guest(current->domain, gpte, &spte);
-
-        spl1e = map_domain_page_with_cache(sl1mfn, cache);
-        spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte;
-        unmap_domain_page_with_cache(spl1e, cache);
-    }
-
-    shadow_unlock(d);
-}
-
-void shadow_l2_normal_pt_update(
-    struct domain *d,
-    unsigned long pa, l2_pgentry_t gpde,
-    struct domain_mmap_cache *cache)
-{
-    unsigned long sl2mfn, hl2mfn;
-    l2_pgentry_t *spl2e;
-    l1_pgentry_t *hl2e;
-
-    shadow_lock(d);
-
-    sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow);
-    if ( sl2mfn )
-    {
-        SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte,
-                 (void *)pa, l2e_get_intpte(gpde));
-        spl2e = map_domain_page_with_cache(sl2mfn, cache);
-        validate_pde_change(d, gpde,
-                            &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]);
-        unmap_domain_page_with_cache(spl2e, cache);
-    }
-    hl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT,
-                             PGT_hl2_shadow);
-    if ( hl2mfn )
-    {
-        hl2e = map_domain_page(hl2mfn);
-        validate_hl2e_change(d, gpde,
-                             &hl2e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)]);
-        unmap_domain_page(hl2e);
-    }
-
-    shadow_unlock(d);
-}
-
-#if CONFIG_PAGING_LEVELS >= 3
-void shadow_l3_normal_pt_update(
-    struct domain *d,
-    unsigned long pa, l3_pgentry_t gpde,
-    struct domain_mmap_cache *cache)
-{
-    BUG(); // not yet implemented
-}
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
-void shadow_l4_normal_pt_update(
-    struct domain *d,
-    unsigned long pa, l4_pgentry_t gpde,
-    struct domain_mmap_cache *cache)
-{
-    BUG(); // not yet implemented
-}
-#endif
-
-int shadow_do_update_va_mapping(unsigned long va,
-                                l1_pgentry_t val,
-                                struct vcpu *v)
-{
-    struct domain *d = v->domain;
-    l1_pgentry_t spte;
-    int rc = 0;
-
-    shadow_lock(d);
-
-    // This is actually overkill - we don't need to sync the L1 itself,
-    // just everything involved in getting to this L1 (i.e. we need
-    // linear_pg_table[l1_linear_offset(va)] to be in sync)...
-    //
-    __shadow_sync_va(v, va);
-
-    l1pte_propagate_from_guest(d, val, &spte);
-    shadow_set_l1e(va, spte, 0);
-
-    /*
-     * If we're in log-dirty mode then we need to note that we've updated
-     * the PTE in the PT-holding page. We need the machine frame number
-     * for this.
-     */
-    __mark_dirty(d, va_to_l1mfn(v, va));
-
-    shadow_unlock(d);
-
-    return rc;
-}
-
-
-/*
- * What lives where in the 32-bit address space in the various shadow modes,
- * and what it uses to get/maintain that mapping.
- *
- * SHADOW MODE:      none         enable         translate         external
- * 
- * 4KB things:
- * guest_vtable    lin_l2     mapped per gl2   lin_l2 via hl2   mapped per gl2
- * shadow_vtable     n/a         sh_lin_l2       sh_lin_l2      mapped per gl2
- * hl2_vtable        n/a            n/a        lin_hl2 via hl2  mapped per gl2
- * monitor_vtable    n/a            n/a             n/a           mapped once
- *
- * 4MB things:
- * guest_linear  lin via gl2    lin via gl2      lin via hl2      lin via hl2
- * shadow_linear     n/a      sh_lin via sl2   sh_lin via sl2   sh_lin via sl2
- * monitor_linear    n/a            n/a             n/a              ???
- * perdomain      perdomain      perdomain       perdomain        perdomain
- * R/O M2P         R/O M2P        R/O M2P           n/a              n/a
- * R/W M2P         R/W M2P        R/W M2P         R/W M2P          R/W M2P
- * P2M               n/a            n/a           R/O M2P          R/O M2P
- *
- * NB:
- * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
- * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
- * all play a part in maintaining these mappings.
- */
-void __update_pagetables(struct vcpu *v)
-{
-    struct domain *d = v->domain;
-    unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table);
-    unsigned long gpfn = mfn_to_gmfn(d, gmfn);
-    unsigned long smfn, hl2mfn, old_smfn;
-    int need_sync = 0;
-
-    int max_mode = ( shadow_mode_external(d) ? SHM_external
-                     : shadow_mode_translate(d) ? SHM_translate
-                     : shadow_mode_enabled(d) ? SHM_enable
-                     : 0 );
-
-    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
-    ASSERT( max_mode );
-
-    /*
-     *  arch.guest_vtable
-     */
-    if ( max_mode & (SHM_enable | SHM_external) )
-    {
-        if ( likely(v->arch.guest_vtable != NULL) )
-            unmap_domain_page_global(v->arch.guest_vtable);
-        v->arch.guest_vtable = map_domain_page_global(gmfn);
-    }
-
-    /*
-     *  arch.shadow_table
-     */
-    if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
-        smfn = shadow_l2_table(d, gpfn, gmfn);
-    else
-    {
-        /*
-         *  move sync later in order to avoid this smfn been 
-         *  unshadowed occasionally
-         */
-        need_sync = 1;
-    }
-    if ( !get_shadow_ref(smfn) )
-        BUG();
-    old_smfn = pagetable_get_pfn(v->arch.shadow_table);
-    v->arch.shadow_table = pagetable_from_pfn(smfn);
-    if ( old_smfn )
-        put_shadow_ref(old_smfn);
-
-    SH_VVLOG("__update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn);
-
-    /*
-     * arch.shadow_vtable
-     */
-    if ( max_mode == SHM_external )
-    {
-        if ( v->arch.shadow_vtable )
-            unmap_domain_page_global(v->arch.shadow_vtable);
-        v->arch.shadow_vtable = map_domain_page_global(smfn);
-    }
-
-    /*
-     * arch.hl2_vtable
-     */
-
-    // if max_mode == SHM_translate, then the hl2 is already installed
-    // correctly in its smfn, and there's nothing to do.
-    //
-    if ( max_mode == SHM_external )
-    {
-        if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
-            hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
-        if ( v->arch.hl2_vtable )
-            unmap_domain_page_global(v->arch.hl2_vtable);
-        v->arch.hl2_vtable = map_domain_page_global(hl2mfn);
-    }
-
-    /*
-     * fixup pointers in monitor table, as necessary
-     */
-    if ( max_mode == SHM_external )
-    {
-        l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
-        l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
-        l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
-
-        ASSERT( shadow_mode_translate(d) );
-
-        if ( !get_shadow_ref(hl2mfn) )
-            BUG();
-        mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
-            l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR);
-        if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT )
-            put_shadow_ref(l2e_get_pfn(old_hl2e));
-
-        if ( !get_shadow_ref(smfn) )
-            BUG();
-        mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
-            l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
-        if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT )
-            put_shadow_ref(l2e_get_pfn(old_sl2e));
-
-        // XXX - maybe this can be optimized somewhat??
-        local_flush_tlb();
-    }
-
-    if(likely(need_sync))
-        shadow_sync_all(d);
-}
-
-void clear_all_shadow_status(struct domain *d)
-{
-    struct vcpu *v = current;
-
-    /*
-     * Don't clean up while other vcpus are working.
-     */
-    if ( v->vcpu_id )
-        return;
-
-    shadow_lock(d);
-
-    free_shadow_pages(d);
-    free_shadow_ht_entries(d);
-    d->arch.shadow_ht =
-        xmalloc_array(struct shadow_status, shadow_ht_buckets);
-    if ( d->arch.shadow_ht == NULL ) {
-        printk("clear all shadow status: xmalloc failed\n");
-        domain_crash_synchronous();
-    }
-    memset(d->arch.shadow_ht, 0,
-           shadow_ht_buckets * sizeof(struct shadow_status));
-
-    free_out_of_sync_entries(d);
-
-    shadow_unlock(d);
-}
-
-/************************************************************************/
-/************************************************************************/
-/************************************************************************/
-
-#if SHADOW_DEBUG
-
-// The following is entirely for _check_pagetable()'s benefit.
-// _check_pagetable() wants to know whether a given entry in a
-// shadow page table is supposed to be the shadow of the guest's
-// current entry, or the shadow of the entry held in the snapshot
-// taken above.
-//
-// Here, we mark all currently existing entries as reflecting
-// the snapshot, above.  All other places in xen that update
-// the shadow will keep the shadow in sync with the guest's
-// entries (via l1pte_propagate_from_guest and friends), which clear
-// the SHADOW_REFLECTS_SNAPSHOT bit.
-//
-static void
-mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn)
-{
-    unsigned long smfn;
-    l1_pgentry_t *l1e;
-    l2_pgentry_t *l2e;
-    unsigned i;
-
-    if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) )
-    {
-        l1e = map_domain_page(smfn);
-        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-            if ( is_guest_l1_slot(i) &&
-                 (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) )
-                l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT);
-        unmap_domain_page(l1e);
-    }
-
-    if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) )
-    {
-        l2e = map_domain_page(smfn);
-        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-            if ( is_guest_l2_slot(0, i) &&
-                 (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) )
-                l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT);
-        unmap_domain_page(l2e);
-    }
-}
-
-// BUG: these are not SMP safe...
-static int sh_l2_present;
-static int sh_l1_present;
-static char *sh_check_name;
-int shadow_status_noswap;
-
-#define v2m(_v, _adr) ({                                                     \
-    unsigned long _a  = (unsigned long)(_adr);                               \
-    l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)];     \
-    unsigned long _pa = -1;                                                  \
-    if ( l2e_get_flags(_pde) & _PAGE_PRESENT )                               \
-    {                                                                        \
-        l1_pgentry_t _pte;                                                   \
-        _pte = shadow_linear_pg_table[l1_linear_offset(_a)];                 \
-        if ( l1e_get_flags(_pte) & _PAGE_PRESENT )                           \
-            _pa = l1e_get_paddr(_pte);                                       \
-    }                                                                        \
-    _pa | (_a & ~PAGE_MASK);                                                 \
-})
-
-#define FAIL(_f, _a...)                                                      \
-    do {                                                                     \
-        printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n",                  \
-               sh_check_name, level, l2_idx, l1_idx, ## _a,                  \
-               __FILE__, __LINE__);                                          \
-        printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte                \
-               " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte               \
-               " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p"               \
-               " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n",                   \
-               l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte),     \
-               l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte),     \
-               p_guest_pte, p_shadow_pte, p_snapshot_pte,                    \
-               (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte),    \
-               (void *)v2m(v, p_snapshot_pte),                               \
-               (l2_idx << L2_PAGETABLE_SHIFT) |                              \
-               (l1_idx << L1_PAGETABLE_SHIFT));                              \
-        errors++;                                                            \
-    } while ( 0 )
-
-static int check_pte(
-    struct vcpu *v,
-    l1_pgentry_t *p_guest_pte,
-    l1_pgentry_t *p_shadow_pte,
-    l1_pgentry_t *p_snapshot_pte,
-    int level, int l2_idx, int l1_idx)
-{
-    struct domain *d = v->domain;
-    l1_pgentry_t guest_pte = *p_guest_pte;
-    l1_pgentry_t shadow_pte = *p_shadow_pte;
-    l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty();
-    l1_pgentry_t eff_guest_pte = l1e_empty();
-    unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn;
-    int errors = 0, guest_writable;
-    int page_table_page;
-
-    if ( (l1e_get_intpte(shadow_pte) == 0) ||
-         (l1e_get_intpte(shadow_pte) == 0xdeadface) ||
-         (l1e_get_intpte(shadow_pte) == 0x00000E00) )
-        return errors;  /* always safe */
-
-    if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) )
-        FAIL("Non zero not present shadow_pte");
-
-    if ( level == 2 ) sh_l2_present++;
-    if ( level == 1 ) sh_l1_present++;
-
-    if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte )
-        eff_guest_pte = snapshot_pte;
-    else
-        eff_guest_pte = guest_pte;
-
-    if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) )
-        FAIL("Guest not present yet shadow is");
-
-    mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK);
-
-    if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) )
-        FAIL("Corrupt?");
-
-    if ( (level == 1) &&
-         (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) &&
-         !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) )
-        FAIL("Dirty coherence");
-
-    if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) &&
-         !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) )
-        FAIL("Accessed coherence");
-
-    if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL )
-        FAIL("global bit set in shadow");
-
-    eff_guest_pfn = l1e_get_pfn(eff_guest_pte);
-    eff_guest_mfn = gmfn_to_mfn(d, eff_guest_pfn);
-    shadow_mfn = l1e_get_pfn(shadow_pte);
-
-    if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) )
-        FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n",
-             __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte));
-
-    page_table_page = mfn_is_page_table(eff_guest_mfn);
-
-    guest_writable =
-        (l1e_get_flags(eff_guest_pte) & _PAGE_RW) ||
-        (shadow_mode_write_l1(d) && (level == 1) && mfn_out_of_sync(eff_guest_mfn));
-
-    if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable )
-    {
-        printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=%lx page_table_page=%d\n",
-               eff_guest_pfn, eff_guest_mfn, shadow_mfn,
-               mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
-               page_table_page);
-        FAIL("RW coherence");
-    }
-
-    if ( (level == 1) &&
-         (l1e_get_flags(shadow_pte) & _PAGE_RW ) &&
-         !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) )
-    {
-        printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=%lx page_table_page=%d\n",
-               eff_guest_pfn, eff_guest_mfn, shadow_mfn,
-               mfn_to_page(eff_guest_mfn)->u.inuse.type_info,
-               page_table_page);
-        FAIL("RW2 coherence");
-    }
-    if ( eff_guest_mfn == shadow_mfn )
-    {
-        if ( level > 1 )
-            FAIL("Linear map ???");    /* XXX this will fail on BSD */
-    }
-    else
-    {
-        if ( level < 2 )
-            FAIL("Shadow in L1 entry?");
-
-        if ( level == 2 )
-        {
-            if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn )
-                FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn,
-                     __shadow_status(d, eff_guest_pfn, PGT_l1_shadow));
-        }
-        else
-            BUG(); // XXX -- not handled yet.
-    }
-
-    return errors;
-}
-#undef FAIL
-#undef v2m
-
-static int check_l1_table(
-    struct vcpu *v, unsigned long gpfn,
-    unsigned long gmfn, unsigned long smfn, unsigned l2_idx)
-{
-    struct domain *d = v->domain;
-    int i;
-    unsigned long snapshot_mfn;
-    l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL;
-    int errors = 0;
-
-    if ( page_out_of_sync(mfn_to_page(gmfn)) )
-    {
-        snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot);
-        ASSERT(snapshot_mfn);
-        p_snapshot = map_domain_page(snapshot_mfn);
-    }
-
-    p_guest  = map_domain_page(gmfn);
-    p_shadow = map_domain_page(smfn);
-
-    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-        errors += check_pte(v, p_guest+i, p_shadow+i,
-                            p_snapshot ? p_snapshot+i : NULL,
-                            1, l2_idx, i);
-    unmap_domain_page(p_shadow);
-    unmap_domain_page(p_guest);
-    if ( p_snapshot )
-        unmap_domain_page(p_snapshot);
-
-    return errors;
-}
-
-#define FAILPT(_f, _a...)                                         \
-    do {                                                          \
-        printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \
-        errors++;                                                 \
-    } while ( 0 )
-
-int check_l2_table(
-    struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes)
-{
-    struct domain *d = v->domain;
-    l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn);
-    l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn);
-    l2_pgentry_t match;
-    int i;
-    int errors = 0;
-    int limit;
-
-    if ( !oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != d) )
-        FAILPT("domain doesn't own page");
-    if ( oos_pdes && (page_get_owner(mfn_to_page(gmfn)) != NULL) )
-        FAILPT("bogus owner for snapshot page");
-    if ( page_get_owner(mfn_to_page(smfn)) != NULL )
-        FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d",
-               smfn, page_get_owner(mfn_to_page(smfn))->domain_id);
-
-#if 0
-    if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-                &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
-                ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
-                 DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
-    {
-        for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-              i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
-              i++ )
-            printk("+++ (%d) %lx %lx\n",i,
-                   l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]));
-        FAILPT("hypervisor entries inconsistent");
-    }
-
-    if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != 
-          l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
-        FAILPT("hypervisor linear map inconsistent");
-#endif
-
-    match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR);
-    if ( !shadow_mode_external(d) &&
-         l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT],
-                         match, PAGE_FLAG_MASK))
-    {
-        FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte,
-               l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >>
-                                   L2_PAGETABLE_SHIFT]),
-               l2e_get_intpte(match));
-    }
-
-    match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
-    if ( !shadow_mode_external(d) &&
-         l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT],
-                         match, PAGE_FLAG_MASK))
-    {
-        FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte,
-               l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]),
-               d->arch.mm_perdomain_pt,
-               l2e_get_intpte(match));
-    }
-
-    if ( shadow_mode_external(d) )
-        limit = L2_PAGETABLE_ENTRIES;
-    else
-        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-
-    /* Check the whole L2. */
-    for ( i = 0; i < limit; i++ )
-        errors += check_pte(v,
-                            (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */
-                            (l1_pgentry_t*)(&spl2e[i]),
-                            NULL,
-                            2, i, 0);
-
-    unmap_domain_page(spl2e);
-    unmap_domain_page(gpl2e);
-
-#if 1
-    if ( errors )
-        printk("check_l2_table returning %d errors\n", errors);
-#endif
-
-    return errors;
-}
-#undef FAILPT
-
-int _check_pagetable(struct vcpu *v, char *s)
-{
-    struct domain *d = v->domain;
-    pagetable_t pt = v->arch.guest_table;
-    unsigned long gptbase = pagetable_get_paddr(pt);
-    unsigned long ptbase_pfn, smfn;
-    unsigned long i;
-    l2_pgentry_t *gpl2e, *spl2e;
-    unsigned long ptbase_mfn = 0;
-    int errors = 0, limit, oos_pdes = 0;
-
-    //_audit_domain(d, AUDIT_QUIET);
-    shadow_lock(d);
-
-    sh_check_name = s;
-    //SH_VVLOG("%s-PT Audit", s);
-    sh_l2_present = sh_l1_present = 0;
-    perfc_incrc(check_pagetable);
-
-    ptbase_mfn = gptbase >> PAGE_SHIFT;
-    ptbase_pfn = mfn_to_gmfn(d, ptbase_mfn);
-
-    if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) )
-    {
-        printk("%s-PT %lx not shadowed\n", s, gptbase);
-        goto out;
-    }
-    if ( page_out_of_sync(mfn_to_page(ptbase_mfn)) )
-    {
-        ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot);
-        oos_pdes = 1;
-        ASSERT(ptbase_mfn);
-    }
-    errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes);
-
-    gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn);
-    spl2e = (l2_pgentry_t *) map_domain_page(smfn);
-
-    /* Go back and recurse. */
-    if ( shadow_mode_external(d) )
-        limit = L2_PAGETABLE_ENTRIES;
-    else
-        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-
-    for ( i = 0; i < limit; i++ )
-    {
-        unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]);
-        unsigned long gl1mfn = gmfn_to_mfn(d, gl1pfn);
-        unsigned long sl1mfn = l2e_get_pfn(spl2e[i]);
-
-        if ( l2e_get_intpte(spl2e[i]) != 0 )  /* FIXME: check flags? */
-        {
-            errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i);
-        }
-    }
-
-    unmap_domain_page(spl2e);
-    unmap_domain_page(gpl2e);
-
- out:
-    if ( errors )
-        BUG();
-
-    shadow_unlock(d);
-
-    return errors;
-}
-
-int _check_all_pagetables(struct vcpu *v, char *s)
-{
-    struct domain *d = v->domain;
-    int i;
-    struct shadow_status *a;
-    unsigned long gmfn;
-    int errors = 0;
-
-    shadow_status_noswap = 1;
-
-    sh_check_name = s;
-    SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id);
-    sh_l2_present = sh_l1_present = 0;
-    perfc_incrc(check_all_pagetables);
-
-    for (i = 0; i < shadow_ht_buckets; i++)
-    {
-        a = &d->arch.shadow_ht[i];
-        while ( a && a->gpfn_and_flags )
-        {
-            gmfn = gmfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask);
-
-            switch ( a->gpfn_and_flags & PGT_type_mask )
-            {
-            case PGT_l1_shadow:
-                errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask,
-                                         gmfn, a->smfn, 0);
-                break;
-            case PGT_l2_shadow:
-                errors += check_l2_table(v, gmfn, a->smfn,
-                                         page_out_of_sync(mfn_to_page(gmfn)));
-                break;
-            case PGT_l3_shadow:
-            case PGT_l4_shadow:
-            case PGT_hl2_shadow:
-                BUG(); // XXX - ought to fix this...
-                break;
-            case PGT_snapshot:
-            case PGT_writable_pred:
-                break;
-            default:
-                errors++;
-                printk("unexpected shadow type %lx, gpfn=%lx, "
-                       "gmfn=%lx smfn=%lx\n",
-                       a->gpfn_and_flags & PGT_type_mask,
-                       a->gpfn_and_flags & PGT_mfn_mask,
-                       gmfn, a->smfn);
-                BUG();
-            }
-            a = a->next;
-        }
-    }
-
-    shadow_status_noswap = 0;
-
-    if ( errors )
-        BUG();
-
-    return errors;
-}
-
-#endif // SHADOW_DEBUG
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/arch/x86/shadow_guest32.c b/xen/arch/x86/shadow_guest32.c
deleted file mode 100644 (file)
index bdc5825..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-#define GUEST_PGENTRY_32
-
-#include "shadow.c"
-struct shadow_ops MODE_64_2_HANDLER = {
-    .guest_paging_levels        = 2,
-    .invlpg                     = shadow_invlpg_64,
-    .fault                      = shadow_fault_64,
-    .update_pagetables          = shadow_update_pagetables,
-    .sync_all                   = sync_all,
-    .remove_all_write_access    = remove_all_write_access,
-    .do_update_va_mapping       = do_update_va_mapping,
-    .mark_mfn_out_of_sync       = mark_mfn_out_of_sync,
-    .is_out_of_sync             = is_out_of_sync,
-    .gva_to_gpa                 = gva_to_gpa_64,
-};
-
diff --git a/xen/arch/x86/shadow_guest32pae.c b/xen/arch/x86/shadow_guest32pae.c
deleted file mode 100644 (file)
index 432c9b9..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-#define GUEST_32PAE
-
-#include "shadow.c"
-struct shadow_ops MODE_64_PAE_HANDLER = {
-    .guest_paging_levels              = 3,
-    .invlpg                     = shadow_invlpg_64,
-    .fault                      = shadow_fault_64,
-    .update_pagetables          = shadow_update_pagetables,
-    .sync_all                   = sync_all,
-    .remove_all_write_access    = remove_all_write_access,
-    .do_update_va_mapping       = do_update_va_mapping,
-    .mark_mfn_out_of_sync       = mark_mfn_out_of_sync,
-    .is_out_of_sync             = is_out_of_sync,
-    .gva_to_gpa                 = gva_to_gpa_64,
-};
-
diff --git a/xen/arch/x86/shadow_public.c b/xen/arch/x86/shadow_public.c
deleted file mode 100644 (file)
index 40aa22e..0000000
+++ /dev/null
@@ -1,2143 +0,0 @@
-/******************************************************************************
- * arch/x86/shadow_public.c
- * 
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
- * 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/mm.h>
-#include <xen/domain_page.h>
-#include <asm/shadow.h>
-#include <asm/page.h>
-#include <xen/event.h>
-#include <xen/sched.h>
-#include <xen/trace.h>
-#include <xen/guest_access.h>
-#include <asm/shadow_64.h>
-
-static int alloc_p2m_table(struct domain *d);
-static void free_p2m_table(struct domain *d);
-
-#define SHADOW_MAX_GUEST32(_encoded) ((L1_PAGETABLE_ENTRIES_32 - 1) - ((_encoded) >> 16))
-
-
-int shadow_direct_map_init(struct domain *d)
-{
-    struct page_info *page;
-    l3_pgentry_t *root;
-
-    if ( !(page = alloc_domheap_pages(NULL, 0, MEMF_dma)) )
-        return 0;
-
-    root = map_domain_page(page_to_mfn(page));
-    memset(root, 0, PAGE_SIZE);
-    root[PAE_SHADOW_SELF_ENTRY] = l3e_from_page(page, __PAGE_HYPERVISOR);
-
-    d->arch.phys_table = pagetable_from_page(page);
-
-    unmap_domain_page(root);
-    return 1;
-}
-
-void shadow_direct_map_clean(struct domain *d)
-{
-    unsigned long mfn;
-    l2_pgentry_t *l2e;
-    l3_pgentry_t *l3e;
-    int i, j;
-
-    mfn = pagetable_get_pfn(d->arch.phys_table);
-
-    /*
-     * We may fail very early before direct map is built.
-     */
-    if ( !mfn )
-        return;
-
-    l3e = (l3_pgentry_t *)map_domain_page(mfn);
-
-    for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
-    {
-        if ( l3e_get_flags(l3e[i]) & _PAGE_PRESENT )
-        {
-            l2e = map_domain_page(l3e_get_pfn(l3e[i]));
-
-            for ( j = 0; j < L2_PAGETABLE_ENTRIES; j++ )
-            {
-                if ( l2e_get_flags(l2e[j]) & _PAGE_PRESENT )
-                    free_domheap_page(mfn_to_page(l2e_get_pfn(l2e[j])));
-            }
-            unmap_domain_page(l2e);
-            free_domheap_page(mfn_to_page(l3e_get_pfn(l3e[i])));
-        }
-    }
-    free_domheap_page(mfn_to_page(mfn));
-
-    unmap_domain_page(l3e);
-
-    d->arch.phys_table = pagetable_null();
-}
-
-/****************************************************************************/
-/************* export interface functions ***********************************/
-/****************************************************************************/
-void free_shadow_pages(struct domain *d);
-
-int shadow_set_guest_paging_levels(struct domain *d, int levels)
-{
-    struct vcpu *v = current;
-
-    /*
-     * Need to wait for VCPU0 to complete the on-going shadow ops.
-     */
-
-    if ( v->domain == d && v->vcpu_id )
-        return 1;
-
-    shadow_lock(d);
-
-    switch(levels) {
-#if CONFIG_PAGING_LEVELS == 4
-    case 4:
-        if ( d->arch.ops != &MODE_64_4_HANDLER )
-            d->arch.ops = &MODE_64_4_HANDLER;
-        shadow_unlock(d);
-        return 1;
-#endif
-#if CONFIG_PAGING_LEVELS == 3
-    case 3:
-        if ( d->arch.ops == NULL ||
-                    shadow_mode_log_dirty(d) )
-        {
-            if ( d->arch.ops != &MODE_64_3_HANDLER )
-                d->arch.ops = &MODE_64_3_HANDLER;
-        }
-        else
-        {
-            if ( d->arch.ops == &MODE_64_2_HANDLER )
-                free_shadow_pages(d);
-            if ( d->arch.ops != &MODE_64_PAE_HANDLER )
-                d->arch.ops = &MODE_64_PAE_HANDLER;
-        }
-        shadow_unlock(d);
-        return 1;
-#endif
-#if CONFIG_PAGING_LEVELS == 4
-    case 3:
-        if ( d->arch.ops == &MODE_64_2_HANDLER )
-            free_shadow_pages(d);
-        if ( d->arch.ops != &MODE_64_PAE_HANDLER )
-            d->arch.ops = &MODE_64_PAE_HANDLER;
-        shadow_unlock(d);
-        return 1;
-#endif
-    case 2:
-#if CONFIG_PAGING_LEVELS == 2
-        if ( d->arch.ops != &MODE_32_2_HANDLER )
-            d->arch.ops = &MODE_32_2_HANDLER;
-#elif CONFIG_PAGING_LEVELS >= 3
-        if ( d->arch.ops != &MODE_64_2_HANDLER )
-            d->arch.ops = &MODE_64_2_HANDLER;
-#endif
-        shadow_unlock(d);
-        return 1;
-    default:
-        shadow_unlock(d);
-        return 0;
-    }
-}
-
-void shadow_invlpg(struct vcpu *v, unsigned long va)
-{
-    struct domain *d = current->domain;
-    d->arch.ops->invlpg(v, va);
-}
-
-int shadow_fault(unsigned long va, struct cpu_user_regs *regs)
-{
-    struct domain *d = current->domain;
-    return d->arch.ops->fault(va, regs);
-}
-
-void __update_pagetables(struct vcpu *v)
-{
-    struct domain *d = v->domain;
-    d->arch.ops->update_pagetables(v);
-}
-
-void __shadow_sync_all(struct domain *d)
-{
-    d->arch.ops->sync_all(d);
-}
-    
-int shadow_remove_all_write_access(
-    struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn)
-{
-    return d->arch.ops->remove_all_write_access(d, readonly_gpfn, readonly_gmfn);
-}
-
-int shadow_do_update_va_mapping(unsigned long va,
-                                l1_pgentry_t val,
-                                struct vcpu *v)
-{
-    struct domain *d = v->domain;
-    return d->arch.ops->do_update_va_mapping(va, val, v);
-}
-
-struct out_of_sync_entry *
-shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn,
-                            unsigned long mfn)
-{
-    struct domain *d = v->domain;
-    return d->arch.ops->mark_mfn_out_of_sync(v, gpfn, mfn);
-}
-
-/*
- * Returns 1 if va's shadow mapping is out-of-sync.
- * Returns 0 otherwise.
- */
-int __shadow_out_of_sync(struct vcpu *v, unsigned long va)
-{
-    struct domain *d = v->domain;
-    return d->arch.ops->is_out_of_sync(v, va);
-}
-
-unsigned long gva_to_gpa(unsigned long gva)
-{
-    struct domain *d = current->domain;
-    return d->arch.ops->gva_to_gpa(gva);
-}
-/****************************************************************************/
-/****************************************************************************/
-#if CONFIG_PAGING_LEVELS >= 3
-
-static void inline
-free_shadow_fl1_table(struct domain *d, unsigned long smfn)
-{
-    l1_pgentry_t *pl1e = map_domain_page(smfn);
-    int i;
-
-    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
-        put_page_from_l1e(pl1e[i], d);
-
-    unmap_domain_page(pl1e);
-}
-
-/*
- * Free l2, l3, l4 shadow tables
- */
-
-void free_fake_shadow_l2(struct domain *d,unsigned long smfn);
-
-static void inline
-free_shadow_tables(struct domain *d, unsigned long smfn, u32 level)
-{
-    pgentry_64_t *ple = map_domain_page(smfn);
-    int i, external = shadow_mode_external(d);
-
-#if CONFIG_PAGING_LEVELS >= 3
-    if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
-    {
-        struct page_info *page = mfn_to_page(smfn);
-        for ( i = 0; i < PAE_L3_PAGETABLE_ENTRIES; i++ )
-        {
-            if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
-                free_fake_shadow_l2(d, entry_get_pfn(ple[i]));
-        }
-
-        page = mfn_to_page(entry_get_pfn(ple[0]));
-        free_domheap_pages(page, SL2_ORDER);
-        unmap_domain_page(ple);
-    }
-    else
-#endif
-    {
-        /*
-         * No Xen mappings in external pages
-         */
-        if ( external )
-        {
-            for ( i = 0; i < PAGETABLE_ENTRIES; i++ ) {
-                if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
-                    put_shadow_ref(entry_get_pfn(ple[i]));
-                if (d->arch.ops->guest_paging_levels == PAGING_L3)
-                {
-#if CONFIG_PAGING_LEVELS >= 3
-                    if ( i == PAE_L3_PAGETABLE_ENTRIES && level == PAGING_L4 )
-#endif
-                        break;
-                }
-            }
-        } 
-        else
-        {
-            for ( i = 0; i < PAGETABLE_ENTRIES; i++ )
-            {
-                /* 
-                 * List the skip/break conditions to avoid freeing
-                 * Xen private mappings.
-                 */
-#if CONFIG_PAGING_LEVELS == 2
-                if ( level == PAGING_L2 && !is_guest_l2_slot(0, i) )
-                    continue;
-#endif
-#if CONFIG_PAGING_LEVELS == 3
-                if ( level == PAGING_L3 && i == L3_PAGETABLE_ENTRIES )
-                    break;
-                if ( level == PAGING_L2 )
-                {
-                    struct page_info *page = mfn_to_page(smfn);
-                    if ( is_xen_l2_slot(page->u.inuse.type_info, i) )
-                        continue;
-                }
-#endif
-#if CONFIG_PAGING_LEVELS == 4
-                if ( level == PAGING_L4 && !is_guest_l4_slot(i))
-                    continue;
-#endif
-                if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
-                    put_shadow_ref(entry_get_pfn(ple[i]));
-            }
-        }
-        unmap_domain_page(ple);
-    }
-}
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
-static void alloc_monitor_pagetable(struct vcpu *v)
-{
-    unsigned long mmfn;
-    l4_pgentry_t *mpl4e;
-    struct page_info *mmfn_info;
-    struct domain *d = v->domain;
-
-    ASSERT(!pagetable_get_paddr(v->arch.monitor_table)); /* we should only get called once */
-
-    mmfn_info = alloc_domheap_page(NULL);
-    ASSERT( mmfn_info );
-    if (!mmfn_info)
-    {
-        printk("Fail to allocate monitor pagetable\n");
-        domain_crash(v->domain);
-    }
-
-    mmfn = page_to_mfn(mmfn_info);
-    mpl4e = (l4_pgentry_t *) map_domain_page_global(mmfn);
-    memcpy(mpl4e, idle_pg_table, PAGE_SIZE);
-    mpl4e[l4_table_offset(PERDOMAIN_VIRT_START)] =
-        l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR);
-
-    /* map the phys_to_machine map into the per domain Read-Only MPT space */
-
-    v->arch.monitor_table = pagetable_from_pfn(mmfn);
-    v->arch.monitor_vtable = (l2_pgentry_t *) mpl4e;
-    mpl4e[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
-
-    if ( v->vcpu_id == 0 )
-        alloc_p2m_table(d);
-    else
-    {
-        unsigned long mfn;
-
-        mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
-        if ( mfn )
-        {
-            l4_pgentry_t *l4tab;
-
-            l4tab = map_domain_page(mfn);
-
-            mpl4e[l4_table_offset(RO_MPT_VIRT_START)] =
-                l4tab[l4_table_offset(RO_MPT_VIRT_START)];
-
-            unmap_domain_page(l4tab);
-        }
-    }
-}
-
-void free_monitor_pagetable(struct vcpu *v)
-{
-    unsigned long mfn;
-
-    /*
-     * free monitor_table.
-     */
-    if ( v->vcpu_id == 0 )
-        free_p2m_table(v->domain);
-
-    /*
-     * Then free monitor_table.
-     */
-    mfn = pagetable_get_pfn(v->arch.monitor_table);
-    unmap_domain_page_global(v->arch.monitor_vtable);
-    free_domheap_page(mfn_to_page(mfn));
-
-    v->arch.monitor_table = pagetable_null();
-    v->arch.monitor_vtable = 0;
-}
-#elif CONFIG_PAGING_LEVELS == 3
-static void alloc_monitor_pagetable(struct vcpu *v)
-{
-    unsigned long m2mfn, m3mfn;
-    l2_pgentry_t *mpl2e;
-    l3_pgentry_t *mpl3e;
-    struct page_info *m2mfn_info, *m3mfn_info;
-    struct domain *d = v->domain;
-    int i;
-
-    ASSERT(!pagetable_get_paddr(v->arch.monitor_table)); /* we should only get called once */
-
-    m3mfn_info = alloc_domheap_pages(NULL, 0, MEMF_dma);
-    ASSERT( m3mfn_info );
-
-    m3mfn = page_to_mfn(m3mfn_info);
-    mpl3e = (l3_pgentry_t *) map_domain_page_global(m3mfn);
-    memset(mpl3e, 0, L3_PAGETABLE_ENTRIES * sizeof(l3_pgentry_t));
-
-    v->arch.monitor_table = pagetable_from_pfn(m3mfn);
-    v->arch.monitor_vtable = (l2_pgentry_t *) mpl3e;
-
-    m2mfn_info = alloc_domheap_page(NULL);
-    ASSERT( m2mfn_info );
-
-    m2mfn = page_to_mfn(m2mfn_info);
-    mpl2e = (l2_pgentry_t *) map_domain_page(m2mfn);
-    memset(mpl2e, 0, PAGE_SIZE);
-
-    /* Map L2 page into L3 */
-    mpl3e[L3_PAGETABLE_ENTRIES - 1] = l3e_from_pfn(m2mfn, _PAGE_PRESENT);
-
-    memcpy(&mpl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)],
-           &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT],
-           L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
-
-    for ( i = 0; i < PDPT_L2_ENTRIES; i++ )
-        mpl2e[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
-            l2e_from_page(
-                virt_to_page(d->arch.mm_perdomain_pt) + i,
-                __PAGE_HYPERVISOR);
-    for ( i = 0; i < (LINEARPT_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
-        mpl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] =
-            (l3e_get_flags(mpl3e[i]) & _PAGE_PRESENT) ?
-            l2e_from_pfn(l3e_get_pfn(mpl3e[i]), __PAGE_HYPERVISOR) :
-            l2e_empty();
-    for ( i = 0; i < (MACHPHYS_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
-        mpl2e[l2_table_offset(RO_MPT_VIRT_START) + i] = l2e_empty();
-
-    if ( v->vcpu_id == 0 )
-    {
-        unsigned long m1mfn;
-        l1_pgentry_t *mpl1e;
-        struct page_info *m1mfn_info;
-
-        /*
-         * 2 l2 slots are allocated here, so that 4M for p2m table,
-         * with this we can guarantee PCI MMIO p2m entries, especially
-         * Cirrus VGA, can be seen by all other vcpus.
-         */
-        for ( i = 0; i < 2; i++ )
-        {
-            m1mfn_info = alloc_domheap_page(NULL);
-            ASSERT( m1mfn_info );
-
-            m1mfn = page_to_mfn(m1mfn_info);
-            mpl1e = (l1_pgentry_t *) map_domain_page(m1mfn);
-            memset(mpl1e, 0, PAGE_SIZE);
-            unmap_domain_page(mpl1e);
-
-            /* Map L1 page into L2 */
-            mpl2e[l2_table_offset(RO_MPT_VIRT_START) + i] =
-                l2e_from_pfn(m1mfn, __PAGE_HYPERVISOR);
-        }
-
-        alloc_p2m_table(d);
-    }
-    else
-    {
-        unsigned long mfn;
-
-        mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
-        if ( mfn )
-        {
-            l3_pgentry_t *l3tab, l3e;
-            l2_pgentry_t *l2tab;
-
-            l3tab = map_domain_page(mfn);
-            l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
-
-            /*
-             * NB: when CONFIG_PAGING_LEVELS == 3,
-             * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
-             * alloc_monitor_pagetable should guarantee this.
-             */
-            if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-                BUG();
-
-            l2tab = map_domain_page(l3e_get_pfn(l3e));
-
-            for ( i = 0; i < (MACHPHYS_MBYTES >> (L2_PAGETABLE_SHIFT - 20)); i++ )
-                mpl2e[l2_table_offset(RO_MPT_VIRT_START) + i] =
-                    l2tab[l2_table_offset(RO_MPT_VIRT_START) + i];
-
-            unmap_domain_page(l2tab);
-            unmap_domain_page(l3tab);
-        }
-    }
-
-    unmap_domain_page(mpl2e);
-}
-
-void free_monitor_pagetable(struct vcpu *v)
-{
-    unsigned long m2mfn, m3mfn;
-    /*
-     * free monitor_table.
-     */
-    if ( v->vcpu_id == 0 )
-        free_p2m_table(v->domain);
-
-    m3mfn = pagetable_get_pfn(v->arch.monitor_table);
-    m2mfn = l2e_get_pfn(v->arch.monitor_vtable[L3_PAGETABLE_ENTRIES - 1]);
-
-    free_domheap_page(mfn_to_page(m2mfn));
-    unmap_domain_page_global(v->arch.monitor_vtable);
-    free_domheap_page(mfn_to_page(m3mfn));
-
-    v->arch.monitor_table = pagetable_null();
-    v->arch.monitor_vtable = 0;
-}
-#endif
-
-static void
-shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry)
-{
-    void *snapshot;
-
-    if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE )
-        return;
-
-    // Clear the out_of_sync bit.
-    //
-    clear_bit(_PGC_out_of_sync, &mfn_to_page(entry->gmfn)->count_info);
-
-    // XXX Need to think about how to protect the domain's
-    // information less expensively.
-    //
-    snapshot = map_domain_page(entry->snapshot_mfn);
-    memset(snapshot, 0, PAGE_SIZE);
-    unmap_domain_page(snapshot);
-
-    put_shadow_ref(entry->snapshot_mfn);
-}
-
-void
-release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry)
-{
-    struct page_info *page;
-
-    page = mfn_to_page(entry->gmfn);
-        
-    // Decrement ref count of guest & shadow pages
-    //
-    put_page(page);
-
-    // Only use entries that have low bits clear...
-    //
-    if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) )
-    {
-        put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT);
-        entry->writable_pl1e = -2;
-    }
-    else
-        ASSERT( entry->writable_pl1e == -1 );
-
-    // Free the snapshot
-    //
-    shadow_free_snapshot(d, entry);
-}
-
-static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn)
-{
-    struct out_of_sync_entry *entry = d->arch.out_of_sync;
-    struct out_of_sync_entry **prev = &d->arch.out_of_sync;
-    struct out_of_sync_entry *found = NULL;
-
-    // NB: Be careful not to call something that manipulates this list
-    //     while walking it.  Collect the results into a separate list
-    //     first, then walk that list.
-    //
-    while ( entry )
-    {
-        if ( entry->gmfn == gmfn )
-        {
-            // remove from out of sync list
-            *prev = entry->next;
-
-            // add to found list
-            entry->next = found;
-            found = entry;
-
-            entry = *prev;
-            continue;
-        }
-        prev = &entry->next;
-        entry = entry->next;
-    }
-
-    prev = NULL;
-    entry = found;
-    while ( entry )
-    {
-        release_out_of_sync_entry(d, entry);
-
-        prev = &entry->next;
-        entry = entry->next;
-    }
-
-    // Add found list to free list
-    if ( prev )
-    {
-        *prev = d->arch.out_of_sync_free;
-        d->arch.out_of_sync_free = found;
-    }
-}
-
-static inline void
-shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return;
-
-    ASSERT(mfn_to_page(gmfn)->count_info & PGC_page_table);
-
-    if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none )
-    {
-        clear_bit(_PGC_page_table, &mfn_to_page(gmfn)->count_info);
-
-        if ( page_out_of_sync(mfn_to_page(gmfn)) )
-        {
-            remove_out_of_sync_entries(d, gmfn);
-        }
-    }
-}
-
-static void inline
-free_shadow_l1_table(struct domain *d, unsigned long smfn)
-{
-    l1_pgentry_t *pl1e = map_domain_page(smfn);
-    l1_pgentry_t *pl1e_next = 0, *sl1e_p;
-    int i;
-    struct page_info *spage = mfn_to_page(smfn);
-    u32 min_max = spage->tlbflush_timestamp;
-    int min = SHADOW_MIN(min_max);
-    int max;
-    
-    if ( d->arch.ops->guest_paging_levels == PAGING_L2 )
-    {
-        max = SHADOW_MAX_GUEST32(min_max);
-        pl1e_next = map_domain_page(smfn + 1);
-    }
-    else
-        max = SHADOW_MAX(min_max);
-
-    for ( i = min; i <= max; i++ )
-    {
-        if ( pl1e_next && i >= L1_PAGETABLE_ENTRIES )
-            sl1e_p = &pl1e_next[i - L1_PAGETABLE_ENTRIES];
-        else
-            sl1e_p = &pl1e[i];
-
-        shadow_put_page_from_l1e(*sl1e_p, d);
-        *sl1e_p = l1e_empty();
-    }
-
-    unmap_domain_page(pl1e);
-    if ( pl1e_next )
-        unmap_domain_page(pl1e_next);
-}
-
-static void inline
-free_shadow_hl2_table(struct domain *d, unsigned long smfn)
-{
-    l1_pgentry_t *hl2 = map_domain_page(smfn);
-    int i, limit;
-
-    SH_VVLOG("%s: smfn=%lx freed", __func__, smfn);
-
-#if CONFIG_PAGING_LEVELS == 2
-    if ( shadow_mode_external(d) )
-        limit = L2_PAGETABLE_ENTRIES;
-    else
-        limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE;
-#endif
-
-    for ( i = 0; i < limit; i++ )
-    {
-        if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT )
-            put_page(mfn_to_page(l1e_get_pfn(hl2[i])));
-    }
-
-    unmap_domain_page(hl2);
-}
-
-static void inline
-free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type)
-{
-    l2_pgentry_t *pl2e = map_domain_page(smfn);
-    int i, external = shadow_mode_external(d);
-
-    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-        if ( external || is_guest_l2_slot(type, i) )
-            if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
-                put_shadow_ref(l2e_get_pfn(pl2e[i]));
-
-    if ( (PGT_base_page_table == PGT_l2_page_table) &&
-         shadow_mode_translate(d) && !external )
-    {
-        // free the ref to the hl2
-        //
-        put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)]));
-    }
-
-    unmap_domain_page(pl2e);
-}
-
-void free_fake_shadow_l2(struct domain *d, unsigned long smfn)
-{
-    pgentry_64_t *ple = map_domain_page(smfn);
-    int i;
-
-    for ( i = 0; i < PAGETABLE_ENTRIES; i = i + 2 )
-        if ( entry_get_flags(ple[i]) & _PAGE_PRESENT )
-            put_shadow_ref(entry_get_pfn(ple[i]));
-
-    unmap_domain_page(ple);
-}
-
-void free_shadow_page(unsigned long smfn)
-{
-    struct page_info *page = mfn_to_page(smfn);
-    unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask;
-    struct domain *d = page_get_owner(mfn_to_page(gmfn));
-    unsigned long gpfn = mfn_to_gmfn(d, gmfn);
-    unsigned long type = page->u.inuse.type_info & PGT_type_mask;
-    u64 index = 0;
-
-    SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn);
-
-    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
-#if CONFIG_PAGING_LEVELS >= 4
-    if ( type == PGT_fl1_shadow ) 
-    {
-        unsigned long mfn;
-        mfn = __shadow_status(d, gpfn, PGT_fl1_shadow);
-        if ( !mfn )
-            gpfn |= PGT_high_mfn_nx;
-    }
-#endif
-#if CONFIG_PAGING_LEVELS >= 3
-    if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
-    {
-        if ( type == PGT_l4_shadow )
-            index = page->tlbflush_timestamp;
-    }
-#endif
-
-    delete_shadow_status(d, gpfn, gmfn, type, index);
-
-    switch ( type )
-    {
-    case PGT_l1_shadow:
-        perfc_decr(shadow_l1_pages);
-        shadow_demote(d, gpfn, gmfn);
-        free_shadow_l1_table(d, smfn);
-        d->arch.shadow_page_count--;
-        break;
-#if CONFIG_PAGING_LEVELS == 2
-    case PGT_l2_shadow:
-        perfc_decr(shadow_l2_pages);
-        shadow_demote(d, gpfn, gmfn);
-        free_shadow_l2_table(d, smfn, page->u.inuse.type_info);
-        d->arch.shadow_page_count--;
-        break;
-
-    case PGT_hl2_shadow:
-        perfc_decr(hl2_table_pages);
-        shadow_demote(d, gpfn, gmfn);
-        free_shadow_hl2_table(d, smfn);
-        d->arch.hl2_page_count--;
-        break;
-#endif
-#if CONFIG_PAGING_LEVELS >= 3
-    case PGT_l2_shadow:
-    case PGT_l3_shadow:
-        shadow_demote(d, gpfn, gmfn);
-        free_shadow_tables(d, smfn, shadow_type_to_level(type));
-        d->arch.shadow_page_count--;
-        break;
-
-    case PGT_l4_shadow:
-        gpfn = gpfn & PGT_mfn_mask;
-        if ( d->arch.ops->guest_paging_levels == PAGING_L3 )
-        {
-            /*
-             * Since a single PDPT page can have multiple PDPs, it's possible
-             * that shadow_demote() has been already called for gmfn.
-             */
-            if ( mfn_is_page_table(gmfn) )
-                shadow_demote(d, gpfn, gmfn);
-        } else
-            shadow_demote(d, gpfn, gmfn);
-
-        free_shadow_tables(d, smfn, shadow_type_to_level(type));
-        d->arch.shadow_page_count--;
-        break;
-
-    case PGT_fl1_shadow:
-        free_shadow_fl1_table(d, smfn);
-        d->arch.shadow_page_count--;
-        break;
-#endif
-    case PGT_snapshot:
-        perfc_decr(snapshot_pages);
-        break;
-
-    default:
-        printk("Free shadow weird page type mfn=%lx type=%" PRtype_info "\n",
-               page_to_mfn(page), page->u.inuse.type_info);
-        break;
-    }
-
-    // No TLB flushes are needed the next time this page gets allocated.
-    //
-    page->tlbflush_timestamp = 0;
-    page->u.free.cpumask     = CPU_MASK_NONE;
-
-    if ( type == PGT_l1_shadow )
-    {
-        list_add(&page->list, &d->arch.free_shadow_frames);
-        perfc_incr(free_l1_pages);
-    }
-    else
-        free_domheap_page(page);
-}
-
-static void
-free_writable_pte_predictions(struct domain *d)
-{
-    int i;
-    struct shadow_status *x;
-
-    for ( i = 0; i < shadow_ht_buckets; i++ )
-    {
-        u32 count;
-        unsigned long *gpfn_list;
-
-        /* Skip empty buckets. */
-        if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
-            continue;
-
-        count = 0;
-        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
-            if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
-                count++;
-
-        gpfn_list = xmalloc_array(unsigned long, count);
-        count = 0;
-        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
-            if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred )
-                gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask;
-
-        while ( count )
-        {
-            count--;
-            delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred, 0);
-        }
-
-        xfree(gpfn_list);
-    }
-}
-
-static void free_shadow_ht_entries(struct domain *d)
-{
-    struct shadow_status *x, *n;
-
-    SH_VLOG("freed tables count=%d l1=%d l2=%d",
-            d->arch.shadow_page_count, perfc_value(shadow_l1_pages), 
-            perfc_value(shadow_l2_pages));
-
-    n = d->arch.shadow_ht_extras;
-    while ( (x = n) != NULL )
-    {
-        d->arch.shadow_extras_count--;
-        n = *((struct shadow_status **)(&x[shadow_ht_extra_size]));
-        xfree(x);
-    }
-
-    d->arch.shadow_ht_extras = NULL;
-    d->arch.shadow_ht_free = NULL;
-
-    ASSERT(d->arch.shadow_extras_count == 0);
-    SH_LOG("freed extras, now %d", d->arch.shadow_extras_count);
-
-    if ( d->arch.shadow_dirty_bitmap != NULL )
-    {
-        xfree(d->arch.shadow_dirty_bitmap);
-        d->arch.shadow_dirty_bitmap = 0;
-        d->arch.shadow_dirty_bitmap_size = 0;
-    }
-
-    xfree(d->arch.shadow_ht);
-    d->arch.shadow_ht = NULL;
-}
-
-static void free_out_of_sync_entries(struct domain *d)
-{
-    struct out_of_sync_entry *x, *n;
-
-    n = d->arch.out_of_sync_extras;
-    while ( (x = n) != NULL )
-    {
-        d->arch.out_of_sync_extras_count--;
-        n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size]));
-        xfree(x);
-    }
-
-    d->arch.out_of_sync_extras = NULL;
-    d->arch.out_of_sync_free = NULL;
-    d->arch.out_of_sync = NULL;
-
-    ASSERT(d->arch.out_of_sync_extras_count == 0);
-    FSH_LOG("freed extra out_of_sync entries, now %d",
-            d->arch.out_of_sync_extras_count);
-}
-
-void free_shadow_pages(struct domain *d)
-{
-    int                   i;
-    struct shadow_status *x;
-    struct vcpu          *v;
-    struct list_head *list_ent, *tmp;
-
-    /*
-     * WARNING! The shadow page table must not currently be in use!
-     * e.g., You are expected to have paused the domain and synchronized CR3.
-     */
-
-    if( !d->arch.shadow_ht ) return;
-
-    shadow_audit(d, 1);
-
-    // first, remove any outstanding refs from out_of_sync entries...
-    //
-    free_out_of_sync_state(d);
-
-    // second, remove any outstanding refs from v->arch.shadow_table
-    // and CR3.
-    //
-    for_each_vcpu(d, v)
-    {
-        if ( pagetable_get_paddr(v->arch.shadow_table) )
-        {
-            put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table));
-            v->arch.shadow_table = pagetable_null();
-
-            if ( shadow_mode_external(d) )
-            {
-                if ( v->arch.shadow_vtable )
-                    unmap_domain_page_global(v->arch.shadow_vtable);
-                v->arch.shadow_vtable = NULL;
-            }
-        }
-
-        if ( v->arch.monitor_shadow_ref )
-        {
-            put_shadow_ref(v->arch.monitor_shadow_ref);
-            v->arch.monitor_shadow_ref = 0;
-        }
-    }
-
-#if CONFIG_PAGING_LEVELS == 2
-    // For external shadows, remove the monitor table's refs
-    //
-    if ( shadow_mode_external(d) )
-    {
-        for_each_vcpu(d, v)
-        {
-            l2_pgentry_t *mpl2e = v->arch.monitor_vtable;
-
-            if ( mpl2e )
-            {
-                l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)];
-                l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)];
-
-                if ( l2e_get_flags(hl2e) & _PAGE_PRESENT )
-                {
-                    put_shadow_ref(l2e_get_pfn(hl2e));
-                    mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty();
-                }
-                if ( l2e_get_flags(smfn) & _PAGE_PRESENT )
-                {
-                    put_shadow_ref(l2e_get_pfn(smfn));
-                    mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty();
-                }
-            }
-        }
-    }
-#endif
-    // Now, the only refs to shadow pages that are left are from the shadow
-    // pages themselves.  We just unpin the pinned pages, and the rest
-    // should automatically disappear.
-    //
-    // NB: Beware: each explicitly or implicit call to free_shadow_page
-    // can/will result in the hash bucket getting rewritten out from
-    // under us...  First, collect the list of pinned pages, then
-    // free them.
-    //
-    for ( i = 0; i < shadow_ht_buckets; i++ )
-    {
-        u32 count;
-        unsigned long *mfn_list;
-
-        /* Skip empty buckets. */
-        if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 )
-            continue;
-
-        count = 0;
-        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
-            if ( MFN_PINNED(x->smfn) )
-                count++;
-        if ( !count )
-            continue;
-
-        mfn_list = xmalloc_array(unsigned long, count);
-        count = 0;
-        for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next )
-            if ( MFN_PINNED(x->smfn) )
-                mfn_list[count++] = x->smfn;
-
-        while ( count )
-        {
-            shadow_unpin(mfn_list[--count]);
-        }
-        xfree(mfn_list);
-    }
-
-    /* Now free the pre-zero'ed pages from the domain. */
-    list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames)
-    {
-        struct page_info *page = list_entry(list_ent, struct page_info, list);
-
-        list_del(list_ent);
-        perfc_decr(free_l1_pages);
-
-        if (d->arch.ops->guest_paging_levels == PAGING_L2)
-        {
-#if CONFIG_PAGING_LEVELS >=3
-            free_domheap_pages(page, SL1_ORDER);
-#else
-            free_domheap_page(page);
-#endif
-        }
-        else
-            free_domheap_page(page);
-    }
-
-    shadow_audit(d, 0);
-
-    SH_LOG("Free shadow table.");
-}
-
-void __shadow_mode_disable(struct domain *d)
-{
-    struct vcpu *v;
-#ifndef NDEBUG
-    int i;
-#endif
-
-    if ( unlikely(!shadow_mode_enabled(d)) )
-        return;
-
-    free_shadow_pages(d);
-    free_writable_pte_predictions(d);
-
-#ifndef NDEBUG
-    for ( i = 0; i < shadow_ht_buckets; i++ )
-    {
-        if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 )
-        {
-            printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%"PRIx64"\n",
-                   __FILE__, i, (u64)d->arch.shadow_ht[i].gpfn_and_flags);
-            BUG();
-        }
-    }
-#endif
-
-    d->arch.shadow_mode = 0;
-
-    free_shadow_ht_entries(d);
-    free_out_of_sync_entries(d);
-
-    for_each_vcpu(d, v)
-        update_pagetables(v);
-}
-
-
-int __shadow_mode_enable(struct domain *d, unsigned int mode)
-{
-    struct vcpu *v;
-    int new_modes = (mode & ~d->arch.shadow_mode);
-#if defined(CONFIG_PAGING_LEVELS)
-    int initial_paging_levels = 3;
-#endif
-
-    // Gotta be adding something to call this function.
-    ASSERT(new_modes);
-
-    // can't take anything away by calling this function.
-    ASSERT(!(d->arch.shadow_mode & ~mode));
-
-#if defined(CONFIG_PAGING_LEVELS)
-    if (  CONFIG_PAGING_LEVELS == 2 )
-        initial_paging_levels = CONFIG_PAGING_LEVELS;
-    if ( !shadow_set_guest_paging_levels(d,
-                                         initial_paging_levels) ) {
-        printk("Unsupported guest paging levels\n");
-        domain_crash_synchronous(); /* need to take a clean path */
-    }
-#endif
-
-    for_each_vcpu(d, v)
-    {
-        invalidate_shadow_ldt(v);
-
-        // We need to set these up for __update_pagetables().
-        // See the comment there.
-
-        /*
-         * arch.guest_vtable
-         */
-        if ( v->arch.guest_vtable &&
-             (v->arch.guest_vtable != __linear_l2_table) )
-        {
-            unmap_domain_page_global(v->arch.guest_vtable);
-        }
-        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
-            v->arch.guest_vtable = __linear_l2_table;
-        else
-            v->arch.guest_vtable = NULL;
-
-        /*
-         * arch.shadow_vtable
-         */
-        if ( v->arch.shadow_vtable &&
-             (v->arch.shadow_vtable != __shadow_linear_l2_table) )
-        {
-            unmap_domain_page_global(v->arch.shadow_vtable);
-        }
-        if ( !(mode & SHM_external) && d->arch.ops->guest_paging_levels == 2)
-            v->arch.shadow_vtable = __shadow_linear_l2_table;
-        else
-            v->arch.shadow_vtable = NULL;
-        
-#if CONFIG_PAGING_LEVELS == 2
-        /*
-         * arch.hl2_vtable
-         */
-        if ( v->arch.hl2_vtable &&
-             (v->arch.hl2_vtable != __linear_hl2_table) )
-        {
-            unmap_domain_page_global(v->arch.hl2_vtable);
-        }
-        if ( (mode & (SHM_translate | SHM_external)) == SHM_translate )
-            v->arch.hl2_vtable = __linear_hl2_table;
-        else
-            v->arch.hl2_vtable = NULL;
-#endif
-        /*
-         * arch.monitor_table & arch.monitor_vtable
-         */
-        if ( v->arch.monitor_vtable )
-        {
-            free_monitor_pagetable(v);
-        }
-        if ( mode & SHM_external )
-        {
-            alloc_monitor_pagetable(v);
-        }
-    }
-
-    if ( new_modes & SHM_enable )
-    {
-        ASSERT( !d->arch.shadow_ht );
-        d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets);
-        if ( d->arch.shadow_ht == NULL )
-            goto nomem;
-
-        memset(d->arch.shadow_ht, 0,
-               shadow_ht_buckets * sizeof(struct shadow_status));
-    }
-
-    if ( new_modes & SHM_log_dirty )
-    {
-        ASSERT( !d->arch.shadow_dirty_bitmap );
-        d->arch.shadow_dirty_bitmap_size = 
-            (d->shared_info->arch.max_pfn +  63) & ~63;
-        d->arch.shadow_dirty_bitmap = 
-            xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size /
-                          (8 * sizeof(unsigned long)));
-        if ( d->arch.shadow_dirty_bitmap == NULL )
-        {
-            d->arch.shadow_dirty_bitmap_size = 0;
-            goto nomem;
-        }
-        memset(d->arch.shadow_dirty_bitmap, 0, 
-               d->arch.shadow_dirty_bitmap_size/8);
-    }
-
-    if ( new_modes & SHM_translate )
-    {
-        if ( !(new_modes & SHM_external) )
-        {
-            ASSERT( !pagetable_get_paddr(d->arch.phys_table) );
-            if ( !alloc_p2m_table(d) )
-            {
-                printk("alloc_p2m_table failed (out-of-memory?)\n");
-                goto nomem;
-            }
-        }
-    }
-
-    // Get rid of any shadow pages from any previous shadow mode.
-    //
-    free_shadow_pages(d);
-
-    d->arch.shadow_mode = mode;
-
-    if ( shadow_mode_refcounts(d) )
-    {
-        struct list_head *list_ent;
-        struct page_info *page;
-
-        /*
-         * Tear down its counts by disassembling its page-table-based refcounts
-         * Also remove CR3's gcount/tcount.
-         * That leaves things like GDTs and LDTs and external refs in tact.
-         *
-         * Most pages will be writable tcount=0.
-         * Some will still be L1 tcount=0 or L2 tcount=0.
-         * Maybe some pages will be type none tcount=0.
-         * Pages granted external writable refs (via grant tables?) will
-         * still have a non-zero tcount.  That's OK.
-         *
-         * gcounts will generally be 1 for PGC_allocated.
-         * GDTs and LDTs will have additional gcounts.
-         * Any grant-table based refs will still be in the gcount.
-         *
-         * We attempt to grab writable refs to each page thus setting its type
-         * Immediately put back those type refs.
-         *
-         * Assert that no pages are left with L1/L2/L3/L4 type.
-         */
-        audit_adjust_pgtables(d, -1, 1);
-
-
-        for (list_ent = d->page_list.next; list_ent != &d->page_list;
-             list_ent = page->list.next) {
-            
-            page = list_entry(list_ent, struct page_info, list);
-            if ( !get_page_type(page, PGT_writable_page) )
-                BUG();
-            put_page_type(page);
-            /*
-             * We use tlbflush_timestamp as back pointer to smfn, and need to
-             * clean up it.
-             */
-            if (shadow_mode_external(d))
-                page->tlbflush_timestamp = 0;
-        }
-        
-        audit_adjust_pgtables(d, 1, 1);
-  
-    }
-
-    return 0;
-
- nomem:
-    if ( (new_modes & SHM_enable) )
-    {
-        xfree(d->arch.shadow_ht);
-        d->arch.shadow_ht = NULL;
-    }
-    if ( (new_modes & SHM_log_dirty) )
-    {
-        xfree(d->arch.shadow_dirty_bitmap);
-        d->arch.shadow_dirty_bitmap = NULL;
-    }
-
-    return -ENOMEM;
-}
-
-
-int shadow_mode_enable(struct domain *d, unsigned int mode)
-{
-    int rc;
-    shadow_lock(d);
-    rc = __shadow_mode_enable(d, mode);
-    shadow_unlock(d);
-    return rc;
-}
-
-static int shadow_mode_table_op(
-    struct domain *d, dom0_shadow_control_t *sc)
-{
-    unsigned int      op = sc->op;
-    int               i, rc = 0;
-    struct vcpu *v;
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    SH_VLOG("shadow mode table op %lx %lx count %d",
-            (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table),  /* XXX SMP */
-            (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */
-            d->arch.shadow_page_count);
-
-    shadow_audit(d, 1);
-
-    switch ( op )
-    {
-    case DOM0_SHADOW_CONTROL_OP_FLUSH:
-        free_shadow_pages(d);
-
-        d->arch.shadow_fault_count       = 0;
-        d->arch.shadow_dirty_count       = 0;
-
-        break;
-   
-    case DOM0_SHADOW_CONTROL_OP_CLEAN:
-        free_shadow_pages(d);
-
-        sc->stats.fault_count       = d->arch.shadow_fault_count;
-        sc->stats.dirty_count       = d->arch.shadow_dirty_count;
-
-        d->arch.shadow_fault_count       = 0;
-        d->arch.shadow_dirty_count       = 0;
-        if ( guest_handle_is_null(sc->dirty_bitmap) ||
-             (d->arch.shadow_dirty_bitmap == NULL) )
-        {
-            rc = -EINVAL;
-            break;
-        }
-
-        if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
-            sc->pages = d->arch.shadow_dirty_bitmap_size;
-
-#define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
-        for ( i = 0; i < sc->pages; i += chunk )
-        {
-            int bytes = ((((sc->pages - i) > chunk) ?
-                          chunk : (sc->pages - i)) + 7) / 8;
-
-            if ( copy_to_guest_offset(
-                sc->dirty_bitmap, i/(8*sizeof(unsigned long)),
-                d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
-                (bytes+sizeof(unsigned long)-1) / sizeof(unsigned long)) )
-            {
-                rc = -EINVAL;
-                break;
-            }
-            memset(
-                d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
-                0, bytes);
-        }
-
-        break;
-
-    case DOM0_SHADOW_CONTROL_OP_PEEK:
-        sc->stats.fault_count       = d->arch.shadow_fault_count;
-        sc->stats.dirty_count       = d->arch.shadow_dirty_count;
-        if ( guest_handle_is_null(sc->dirty_bitmap) ||
-             (d->arch.shadow_dirty_bitmap == NULL) )
-        {
-            rc = -EINVAL;
-            break;
-        }
-        if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
-            sc->pages = d->arch.shadow_dirty_bitmap_size;
-
-        if ( copy_to_guest(sc->dirty_bitmap, 
-                           d->arch.shadow_dirty_bitmap,
-                           (((sc->pages+7)/8)+sizeof(unsigned long)-1) /
-                           sizeof(unsigned long)) )
-        {
-            rc = -EINVAL;
-            break;
-        }
-
-        break;
-
-    default:
-        rc = -EINVAL;
-        break;
-    }
-
-    SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count);
-    shadow_audit(d, 1);
-
-    for_each_vcpu(d,v)
-        __update_pagetables(v);
-
-    return rc;
-}
-
-int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
-{
-    unsigned int op = sc->op;
-    int          rc = 0;
-    struct vcpu *v;
-
-    if ( unlikely(d == current->domain) )
-    {
-        DPRINTK("Don't try to do a shadow op on yourself!\n");
-        return -EINVAL;
-    }
-
-    domain_pause(d);
-
-    shadow_lock(d);
-
-    switch ( op )
-    {
-    case DOM0_SHADOW_CONTROL_OP_OFF:
-        if ( shadow_mode_enabled(d) )
-        {
-            __shadow_sync_all(d);
-            __shadow_mode_disable(d);
-        }
-        break;
-
-    case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
-        free_shadow_pages(d);
-        rc = __shadow_mode_enable(d, SHM_enable);
-        break;
-
-    case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
-        free_shadow_pages(d);
-        rc = __shadow_mode_enable(
-            d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty);
-        break;
-
-    case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
-        free_shadow_pages(d);
-        rc = __shadow_mode_enable(
-            d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate);
-        break;
-
-    default:
-        rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL;
-        break;
-    }
-
-    shadow_unlock(d);
-
-    for_each_vcpu(d,v)
-        update_pagetables(v);
-
-    domain_unpause(d);
-
-    return rc;
-}
-
-void shadow_mode_init(void)
-{
-}
-
-int _shadow_mode_refcounts(struct domain *d)
-{
-    return shadow_mode_refcounts(d);
-}
-
-static int
-map_p2m_entry(pgentry_64_t *top_tab, unsigned long gpfn, unsigned long mfn)
-{
-#if CONFIG_PAGING_LEVELS >= 4
-    pgentry_64_t l4e = { 0 };
-    pgentry_64_t *l3tab = NULL;
-#endif
-#if CONFIG_PAGING_LEVELS >= 3
-    pgentry_64_t l3e = { 0 };
-#endif
-    l2_pgentry_t *l2tab = NULL;
-    l1_pgentry_t *l1tab = NULL;
-    unsigned long *l0tab = NULL;
-    l2_pgentry_t l2e = { 0 };
-    l1_pgentry_t l1e = { 0 };
-    struct page_info *page;
-    unsigned long va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn));
-
-#if CONFIG_PAGING_LEVELS >= 4
-    l4e = top_tab[l4_table_offset(va)];
-    if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) )
-    {
-        page = alloc_domheap_page(NULL);
-        if ( !page )
-            goto nomem;
-
-        l3tab = map_domain_page(page_to_mfn(page));
-        memset(l3tab, 0, PAGE_SIZE);
-        l4e = top_tab[l4_table_offset(va)] =
-            entry_from_page(page, __PAGE_HYPERVISOR);
-    }
-    else
-        l3tab = map_domain_page(entry_get_pfn(l4e));
-
-    l3e = l3tab[l3_table_offset(va)];
-    if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
-    {
-        page = alloc_domheap_page(NULL);
-        if ( !page )
-            goto nomem;
-
-        l2tab = map_domain_page(page_to_mfn(page));
-        memset(l2tab, 0, PAGE_SIZE);
-        l3e = l3tab[l3_table_offset(va)] =
-            entry_from_page(page, __PAGE_HYPERVISOR);
-    }
-    else
-        l2tab = map_domain_page(entry_get_pfn(l3e));
-
-    unmap_domain_page(l3tab);
-#else
-    l3e = top_tab[l3_table_offset(va)];
-
-    /*
-     * NB: when CONFIG_PAGING_LEVELS == 3,
-     * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
-     * alloc_monitor_pagetable should guarantee this.
-     */
-    if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
-        BUG();
-
-    l2tab = map_domain_page(entry_get_pfn(l3e));
-#endif
-
-    l2e = l2tab[l2_table_offset(va)];
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-    {
-        page = alloc_domheap_page(NULL);
-        if ( !page )
-            goto nomem;
-
-        l1tab = map_domain_page(page_to_mfn(page));
-        memset(l1tab, 0, PAGE_SIZE);
-        l2e = l2tab[l2_table_offset(va)] =
-            l2e_from_page(page, __PAGE_HYPERVISOR);
-    }
-    else
-        l1tab = map_domain_page(l2e_get_pfn(l2e));
-
-    unmap_domain_page(l2tab);
-
-    l1e = l1tab[l1_table_offset(va)];
-    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
-    {
-        page = alloc_domheap_page(NULL);
-        if ( !page )
-            goto nomem;
-
-        l0tab = map_domain_page(page_to_mfn(page));
-        memset(l0tab, 0, PAGE_SIZE);
-        l1e = l1tab[l1_table_offset(va)] =
-            l1e_from_page(page, __PAGE_HYPERVISOR);
-    }
-    else
-        l0tab = map_domain_page(l1e_get_pfn(l1e));
-
-    unmap_domain_page(l1tab);
-
-    l0tab[gpfn & ((PAGE_SIZE / sizeof(mfn)) - 1)] = mfn;
-
-    unmap_domain_page(l0tab);
-
-    return 1;
-
-nomem:
-    return 0;
-}
-
-int
-set_p2m_entry(struct domain *d, unsigned long gpfn, unsigned long mfn,
-              struct domain_mmap_cache *l2cache,
-              struct domain_mmap_cache *l1cache)
-{
-    unsigned long tabmfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
-    pgentry_64_t *top_tab;
-    int error;
-
-    ASSERT(tabmfn != 0);
-    ASSERT(shadow_lock_is_acquired(d));
-
-    top_tab = map_domain_page_with_cache(tabmfn, l2cache);
-
-    if ( !(error = map_p2m_entry(top_tab, gpfn, mfn)) )
-        domain_crash(d);
-
-    unmap_domain_page_with_cache(top_tab, l2cache);
-
-    return error;
-}
-
-static int
-alloc_p2m_table(struct domain *d)
-{
-    struct list_head *list_ent;
-    pgentry_64_t *top_tab = NULL;
-    unsigned long gpfn, mfn;
-    int error = 0;
-
-    ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
-
-    top_tab = map_domain_page(
-        pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
-
-    list_ent = d->page_list.next;
-
-    while ( list_ent != &d->page_list )
-    {
-        struct page_info *page;
-
-        page = list_entry(list_ent, struct page_info, list);
-        mfn = page_to_mfn(page);
-
-        gpfn = get_gpfn_from_mfn(mfn);
-
-        if ( !(error = map_p2m_entry(top_tab, gpfn, mfn)) )
-        {
-            domain_crash(d);
-            break;
-        }
-
-        list_ent = page->list.next;
-    }
-
-    unmap_domain_page(top_tab);
-
-    return error;
-}
-
-#if CONFIG_PAGING_LEVELS >= 3
-static void
-free_p2m_table(struct domain *d)
-{
-    unsigned long va;
-    l1_pgentry_t *l1tab;
-    l1_pgentry_t l1e;
-    l2_pgentry_t *l2tab;
-    l2_pgentry_t l2e;
-#if CONFIG_PAGING_LEVELS >= 3
-    l3_pgentry_t *l3tab;
-    l3_pgentry_t l3e;
-#endif
-#if CONFIG_PAGING_LEVELS == 4
-    int i3;
-    l4_pgentry_t *l4tab;
-    l4_pgentry_t l4e;
-#endif
-
-    ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
-
-#if CONFIG_PAGING_LEVELS == 4
-    l4tab = map_domain_page(
-        pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
-#endif
-#if CONFIG_PAGING_LEVELS == 3
-    l3tab = map_domain_page(
-        pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
-
-    l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
-
-    /*
-     * NB: when CONFIG_PAGING_LEVELS == 3,
-     * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
-     * alloc_monitor_pagetable should guarantee this.
-     */
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        BUG();
-
-    l2tab = map_domain_page(l3e_get_pfn(l3e));
-#endif
-
-    for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; )
-    {
-#if CONFIG_PAGING_LEVELS == 4
-        l4e = l4tab[l4_table_offset(va)];
-
-        if ( l4e_get_flags(l4e) & _PAGE_PRESENT )
-        {
-            l3tab = map_domain_page(l4e_get_pfn(l4e));
-
-            for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; i3++ )
-            {
-                l3e = l3tab[l3_table_offset(va)];
-
-                if ( l3e_get_flags(l3e) & _PAGE_PRESENT )
-                {
-                    int i2;
-
-                    l2tab = map_domain_page(l3e_get_pfn(l3e));
-
-                    for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
-                    {
-#endif
-                        l2e = l2tab[l2_table_offset(va)];
-
-                        if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
-                        {
-                            int i1;
-
-                            l1tab = map_domain_page(l2e_get_pfn(l2e));
-
-                            /*
-                             * unsigned long phys_to_machine_mapping[]
-                             */
-                            for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++ )
-                            {
-                                l1e = l1tab[l1_table_offset(va)];
-
-                                if ( l1e_get_flags(l1e) & _PAGE_PRESENT )
-                                    free_domheap_page(mfn_to_page(l1e_get_pfn(l1e)));
-
-                                va += PAGE_SIZE;
-                            }
-                            unmap_domain_page(l1tab);
-                            free_domheap_page(mfn_to_page(l2e_get_pfn(l2e)));
-                        }
-                        else
-                            va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
-
-#if CONFIG_PAGING_LEVELS == 4
-                    }
-                    unmap_domain_page(l2tab);
-                    free_domheap_page(mfn_to_page(l3e_get_pfn(l3e)));
-                }
-                else
-                    va += PAGE_SIZE * L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES;
-            }
-            unmap_domain_page(l3tab);
-            free_domheap_page(mfn_to_page(l4e_get_pfn(l4e)));
-        }
-        else
-            va += PAGE_SIZE *
-                L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES * L3_PAGETABLE_ENTRIES;
-#endif
-    }
-
-#if CONFIG_PAGING_LEVELS == 4
-    unmap_domain_page(l4tab);
-#endif
-#if CONFIG_PAGING_LEVELS == 3
-    unmap_domain_page(l3tab);
-#endif
-}
-#endif
-
-void shadow_l1_normal_pt_update(
-    struct domain *d,
-    paddr_t pa, l1_pgentry_t gpte,
-    struct domain_mmap_cache *cache)
-{
-    unsigned long sl1mfn;
-    l1_pgentry_t *spl1e, spte;
-
-    shadow_lock(d);
-
-    sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow);
-    if ( sl1mfn )
-    {
-        SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpde=%" PRIpte,
-                 (void *)pa, l1e_get_intpte(gpte));
-        l1pte_propagate_from_guest(current->domain, gpte, &spte);
-
-        spl1e = map_domain_page_with_cache(sl1mfn, cache);
-        spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte;
-        unmap_domain_page_with_cache(spl1e, cache);
-    }
-
-    shadow_unlock(d);
-}
-
-void shadow_l2_normal_pt_update(
-    struct domain *d,
-    paddr_t pa, l2_pgentry_t gpde,
-    struct domain_mmap_cache *cache)
-{
-    unsigned long sl2mfn;
-    l2_pgentry_t *spl2e;
-
-    shadow_lock(d);
-
-    sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow);
-    if ( sl2mfn )
-    {
-        SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte,
-                 (void *)pa, l2e_get_intpte(gpde));
-        spl2e = map_domain_page_with_cache(sl2mfn, cache);
-        validate_pde_change(d, gpde,
-                            &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]);
-        unmap_domain_page_with_cache(spl2e, cache);
-    }
-
-    shadow_unlock(d);
-}
-
-#if CONFIG_PAGING_LEVELS >= 3
-void shadow_l3_normal_pt_update(
-    struct domain *d,
-    paddr_t pa, l3_pgentry_t l3e,
-    struct domain_mmap_cache *cache)
-{
-    unsigned long sl3mfn;
-    pgentry_64_t *spl3e;
-
-    shadow_lock(d);
-
-    sl3mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l3_shadow);
-    if ( sl3mfn )
-    {
-        SH_VVLOG("shadow_l3_normal_pt_update pa=%p, l3e=%" PRIpte,
-                 (void *)pa, l3e_get_intpte(l3e));
-        spl3e = (pgentry_64_t *) map_domain_page_with_cache(sl3mfn, cache);
-        validate_entry_change(d, (pgentry_64_t *) &l3e,
-                              &spl3e[(pa & ~PAGE_MASK) / sizeof(l3_pgentry_t)], 
-                              shadow_type_to_level(PGT_l3_shadow));
-        unmap_domain_page_with_cache(spl3e, cache);
-    }
-
-    shadow_unlock(d);
-}
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
-void shadow_l4_normal_pt_update(
-    struct domain *d,
-    paddr_t pa, l4_pgentry_t l4e,
-    struct domain_mmap_cache *cache)
-{
-    unsigned long sl4mfn;
-    pgentry_64_t *spl4e;
-
-    shadow_lock(d);
-
-    sl4mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l4_shadow);
-    if ( sl4mfn )
-    {
-        SH_VVLOG("shadow_l4_normal_pt_update pa=%p, l4e=%" PRIpte,
-                 (void *)pa, l4e_get_intpte(l4e));
-        spl4e = (pgentry_64_t *)map_domain_page_with_cache(sl4mfn, cache);
-        validate_entry_change(d, (pgentry_64_t *)&l4e,
-                              &spl4e[(pa & ~PAGE_MASK) / sizeof(l4_pgentry_t)], 
-                              shadow_type_to_level(PGT_l4_shadow));
-        unmap_domain_page_with_cache(spl4e, cache);
-    }
-
-    shadow_unlock(d);
-}
-#endif
-
-static void
-translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn)
-{
-    int i;
-    l1_pgentry_t *l1;
-
-    l1 = map_domain_page(l1mfn);
-    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
-    {
-        if ( is_guest_l1_slot(i) &&
-             (l1e_get_flags(l1[i]) & _PAGE_PRESENT) )
-        {
-            unsigned long mfn = l1e_get_pfn(l1[i]);
-            unsigned long gpfn = mfn_to_gmfn(d, mfn);
-            ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
-            l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i]));
-        }
-    }
-    unmap_domain_page(l1);
-}
-
-// This is not general enough to handle arbitrary pagetables
-// with shared L1 pages, etc., but it is sufficient for bringing
-// up dom0.
-//
-void
-translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn,
-                    unsigned int type)
-{
-    int i;
-    l2_pgentry_t *l2;
-
-    ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d));
-
-    l2 = map_domain_page(l2mfn);
-    for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
-    {
-        if ( is_guest_l2_slot(type, i) &&
-             (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
-        {
-            unsigned long mfn = l2e_get_pfn(l2[i]);
-            unsigned long gpfn = mfn_to_gmfn(d, mfn);
-            ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn);
-            l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i]));
-            translate_l1pgtable(d, p2m, mfn);
-        }
-    }
-    unmap_domain_page(l2);
-}
-
-void
-remove_shadow(struct domain *d, unsigned long gpfn, u32 stype)
-{
-    unsigned long smfn;
-
-    shadow_lock(d);
-
-    while ( stype >= PGT_l1_shadow )
-    {
-        smfn = __shadow_status(d, gpfn, stype);
-        if ( smfn && MFN_PINNED(smfn) )
-            shadow_unpin(smfn);
-        stype -= PGT_l1_shadow;
-    }
-
-    shadow_unlock(d);
-}
-
-unsigned long
-get_mfn_from_gpfn_foreign(struct domain *d, unsigned long gpfn)
-{
-    unsigned long va, tabpfn;
-    l1_pgentry_t *l1, l1e;
-    l2_pgentry_t *l2, l2e;
-#if CONFIG_PAGING_LEVELS >= 4
-    pgentry_64_t *l4 = NULL;
-    pgentry_64_t l4e = { 0 };
-#endif
-    pgentry_64_t *l3 = NULL;
-    pgentry_64_t l3e = { 0 };
-    unsigned long *l0tab = NULL;
-    unsigned long mfn;
-
-    ASSERT(shadow_mode_translate(d));
-
-    perfc_incrc(get_mfn_from_gpfn_foreign);
-
-    va = RO_MPT_VIRT_START + (gpfn * sizeof(mfn));
-
-    tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
-    if ( !tabpfn )
-        return INVALID_MFN;
-
-#if CONFIG_PAGING_LEVELS >= 4
-    l4 = map_domain_page(tabpfn);
-    l4e = l4[l4_table_offset(va)];
-    unmap_domain_page(l4);
-    if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) )
-        return INVALID_MFN;
-
-    l3 = map_domain_page(entry_get_pfn(l4e));
-#else
-    l3 = map_domain_page(tabpfn);
-#endif
-    l3e = l3[l3_table_offset(va)];
-    unmap_domain_page(l3);
-    if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
-        return INVALID_MFN;
-    l2 = map_domain_page(entry_get_pfn(l3e));
-    l2e = l2[l2_table_offset(va)];
-    unmap_domain_page(l2);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        return INVALID_MFN;
-
-    l1 = map_domain_page(l2e_get_pfn(l2e));
-    l1e = l1[l1_table_offset(va)];
-    unmap_domain_page(l1);
-    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
-        return INVALID_MFN;
-
-    l0tab = map_domain_page(l1e_get_pfn(l1e));
-    mfn = l0tab[gpfn & ((PAGE_SIZE / sizeof (mfn)) - 1)];
-    unmap_domain_page(l0tab);
-    return mfn;
-}
-
-static u32 remove_all_access_in_page(
-    struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn)
-{
-    l1_pgentry_t *pl1e = map_domain_page(l1mfn);
-    l1_pgentry_t match, ol2e;
-    unsigned long flags  = _PAGE_PRESENT;
-    int i;
-    u32 count = 0;
-    int is_l1_shadow =
-        ((mfn_to_page(l1mfn)->u.inuse.type_info & PGT_type_mask) ==
-         PGT_l1_shadow);
-
-    match = l1e_from_pfn(forbidden_gmfn, flags);
-
-    for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
-    {
-        if ( l1e_has_changed(pl1e[i], match, flags) )
-            continue;
-
-        ol2e = pl1e[i];
-        pl1e[i] = l1e_empty();
-        count++;
-
-        if ( is_l1_shadow )
-            shadow_put_page_from_l1e(ol2e, d);
-        else /* must be an hl2 page */
-            put_page(mfn_to_page(forbidden_gmfn));
-    }
-
-    unmap_domain_page(pl1e);
-
-    return count;
-}
-
-static u32 __shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn)
-{
-    int i;
-    struct shadow_status *a;
-    u32 count = 0;
-
-    if ( unlikely(!shadow_mode_enabled(d)) )
-        return 0;
-
-    ASSERT(shadow_lock_is_acquired(d));
-    perfc_incrc(remove_all_access);
-
-    for (i = 0; i < shadow_ht_buckets; i++)
-    {
-        a = &d->arch.shadow_ht[i];
-        while ( a && a->gpfn_and_flags )
-        {
-            switch (a->gpfn_and_flags & PGT_type_mask)
-            {
-            case PGT_l1_shadow:
-            case PGT_l2_shadow:
-            case PGT_l3_shadow:
-            case PGT_l4_shadow:
-            case PGT_hl2_shadow:
-                count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn);
-                break;
-            case PGT_snapshot:
-            case PGT_writable_pred:
-                // these can't hold refs to the forbidden page
-                break;
-            default:
-                BUG();
-            }
-
-            a = a->next;
-        }
-    }
-
-    return count;
-}
-
-void shadow_drop_references(
-    struct domain *d, struct page_info *page)
-{
-    if ( likely(!shadow_mode_refcounts(d)) ||
-         ((page->u.inuse.type_info & PGT_count_mask) == 0) )
-        return;
-
-    /* XXX This needs more thought... */
-    printk("%s: needing to call __shadow_remove_all_access for mfn=%lx\n",
-           __func__, page_to_mfn(page));
-    printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
-           page->count_info, page->u.inuse.type_info);
-
-    shadow_lock(d);
-    __shadow_remove_all_access(d, page_to_mfn(page));
-    shadow_unlock(d);
-
-    printk("After:  mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
-           page->count_info, page->u.inuse.type_info);
-}
-
-/* XXX Needs more thought. Neither pretty nor fast: a place holder. */
-void shadow_sync_and_drop_references(
-    struct domain *d, struct page_info *page)
-{
-    if ( likely(!shadow_mode_refcounts(d)) )
-        return;
-
-    shadow_lock(d);
-
-    if ( page_out_of_sync(page) )
-        __shadow_sync_mfn(d, page_to_mfn(page));
-
-    __shadow_remove_all_access(d, page_to_mfn(page));
-
-    shadow_unlock(d);
-}
-
-void clear_all_shadow_status(struct domain *d)
-{
-    struct vcpu *v = current;
-
-    /*
-     * Don't clean up while other vcpus are working.
-     */
-    if ( v->vcpu_id )
-        return;
-
-    shadow_lock(d);
-
-    free_shadow_pages(d);
-    free_shadow_ht_entries(d);
-    d->arch.shadow_ht = 
-        xmalloc_array(struct shadow_status, shadow_ht_buckets);
-    if ( d->arch.shadow_ht == NULL ) {
-        printk("clear all shadow status:xmalloc fail\n");
-        domain_crash_synchronous();
-    }
-    memset(d->arch.shadow_ht, 0,
-           shadow_ht_buckets * sizeof(struct shadow_status));
-
-    free_out_of_sync_entries(d);
-
-    shadow_unlock(d);
-}
-
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
index a78ed07d2643f5eb5ca8f40657f9d67648eb10ee..734bd41797a56284c01dff6bdbf8588bc59d2ace 100644 (file)
@@ -896,7 +896,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
        v = alloc_idle_vcpu(cpu);
        BUG_ON(v == NULL);
 
-       v->arch.monitor_table = pagetable_from_paddr(__pa(idle_pg_table));
+       v->arch.cr3 = __pa(idle_pg_table);
 
        /* start_eip had better be page-aligned! */
        start_eip = setup_trampoline();
index 87f9a4fd429c42708f88c9bb6f9022eabaafb35f..2d398712fed785471f7f8ae59234f6040fa227d2 100644 (file)
@@ -277,6 +277,21 @@ void show_stack(struct cpu_user_regs *regs)
     show_trace(regs);
 }
 
+void show_xen_trace()
+{
+    struct cpu_user_regs regs;
+#ifdef __x86_64
+    __asm__("movq %%rsp,%0" : "=m" (regs.rsp));
+    __asm__("movq %%rbp,%0" : "=m" (regs.rbp));
+    __asm__("leaq 0(%%rip),%0" : "=a" (regs.rip));
+#else
+    __asm__("movl %%esp,%0" : "=m" (regs.esp));
+    __asm__("movl %%ebp,%0" : "=m" (regs.ebp));
+    __asm__("call 1f; 1: popl %0" : "=a" (regs.eip));
+#endif
+    show_trace(&regs);
+}
+
 void show_stack_overflow(unsigned long esp)
 {
 #ifdef MEMORY_GUARD
@@ -861,8 +876,8 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
 
     if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
     {
-        if ( shadow_mode_external(d) && guest_mode(regs) )
-            return shadow_fault(addr, regs);
+        if ( shadow2_mode_external(d) && guest_mode(regs) )
+            return shadow2_fault(addr, regs);
         if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
             return handle_gdt_ldt_mapping_fault(
                 addr - GDT_LDT_VIRT_START, regs);
@@ -873,15 +888,15 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
         return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0);
     }
 
-    if ( unlikely(shadow_mode_enabled(d)) )
-        return shadow_fault(addr, regs);
-
     if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) &&
          guest_kernel_mode(v, regs) &&
          ((regs->error_code & (PGERR_write_access|PGERR_page_present)) ==
           (PGERR_write_access|PGERR_page_present)) )
         return ptwr_do_page_fault(d, addr, regs) ? EXCRET_fault_fixed : 0;
 
+    if ( shadow2_mode_enabled(d) )
+        return shadow2_fault(addr, regs);
+
     return 0;
 }
 
@@ -906,6 +921,13 @@ asmlinkage int do_page_fault(struct cpu_user_regs *regs)
 
     perfc_incrc(page_faults);
 
+    if ( shadow2_mode_enabled(current->domain) )
+        debugtrace_printk("%s %s %d dom=%d eip=%p cr2=%p code=%d cs=%x\n",
+                          __func__, __FILE__, __LINE__,
+                          current->domain->domain_id,
+                          (void *)regs->eip, (void *)addr, regs->error_code,
+                          regs->cs);
+
     if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
         return rc;
 
index db3237242c4bf8253e2c63f1a5af5eba39810902..8fe7b9b34404f09e285329c5cfcc380fd1a346c7 100644 (file)
@@ -15,6 +15,7 @@
 #include <asm/current.h>
 #include <asm/flushtlb.h>
 #include <asm/hardirq.h>
+#include <asm/hvm/support.h>
 
 static inline struct vcpu *mapcache_current_vcpu(void)
 {
@@ -58,10 +59,10 @@ void *map_domain_page(unsigned long pfn)
     cache = &v->domain->arch.mapcache;
 
     hashent = &cache->vcpu_maphash[vcpu].hash[MAPHASH_HASHFN(pfn)];
-    if ( hashent->pfn == pfn )
+    if ( hashent->pfn == pfn && (idx = hashent->idx) != MAPHASHENT_NOTINUSE )
     {
-        idx = hashent->idx;
         hashent->refcnt++;
+        ASSERT(idx < MAPCACHE_ENTRIES);
         ASSERT(hashent->refcnt != 0);
         ASSERT(l1e_get_pfn(cache->l1tab[idx]) == pfn);
         goto out;
@@ -178,6 +179,30 @@ void mapcache_init(struct domain *d)
                 MAPHASHENT_NOTINUSE;
 }
 
+paddr_t mapped_domain_page_to_maddr(void *va) 
+/* Convert a pointer in a mapped domain page to a machine address. 
+ * Takes any pointer that's valid for use in unmap_domain_page() */
+{
+    unsigned int idx;
+    struct vcpu *v;
+    struct mapcache *cache;
+    unsigned long pfn;
+
+    ASSERT(!in_irq());
+
+    ASSERT((void *)MAPCACHE_VIRT_START <= va);
+    ASSERT(va < (void *)MAPCACHE_VIRT_END);
+
+    v = mapcache_current_vcpu();
+
+    cache = &v->domain->arch.mapcache;
+
+    idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
+    pfn = l1e_get_pfn(cache->l1tab[idx]);
+    return ((paddr_t) pfn << PAGE_SHIFT 
+            | ((unsigned long) va & ~PAGE_MASK));
+}
+
 #define GLOBALMAP_BITS (IOREMAP_MBYTES << (20 - PAGE_SHIFT))
 static unsigned long inuse[BITS_TO_LONGS(GLOBALMAP_BITS)];
 static unsigned long garbage[BITS_TO_LONGS(GLOBALMAP_BITS)];
@@ -233,6 +258,8 @@ void unmap_domain_page_global(void *va)
     l1_pgentry_t *pl1e;
     unsigned int idx;
 
+    ASSERT((__va >= IOREMAP_VIRT_START) && (__va <= (IOREMAP_VIRT_END - 1)));
+
     /* /First/, we zap the PTE. */
     pl2e = virt_to_xen_l2e(__va);
     pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(__va);
index 868140e586f5fb4f2de8844fd32aab8abf655810..dc2450201a7c381c37248a77f231f56dce47fa0e 100644 (file)
@@ -75,8 +75,7 @@ void __init paging_init(void)
     printk("PAE disabled.\n");
 #endif
 
-    idle_vcpu[0]->arch.monitor_table =
-        pagetable_from_paddr(__pa(idle_pg_table));
+    idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table);
 
     if ( cpu_has_pge )
     {
index d5db7f3b3030af250a79eaaac76794264d08e57b..f173c05d831048c38ea20e5af41445abe2525bad 100644 (file)
@@ -81,8 +81,7 @@ void __init paging_init(void)
     l2_pgentry_t *l2_ro_mpt;
     struct page_info *pg;
 
-    idle_vcpu[0]->arch.monitor_table =
-        pagetable_from_paddr(__pa(idle_pg_table));
+    idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table);
 
     /* Create user-accessible L2 directory to map the MPT for guests. */
     l3_ro_mpt = alloc_xenheap_page();
index cfe2a6a5a00fa25f20138d471525f8749a85596b..84c9c35952f0b76bbed55b2daf4844cbc2d7cf1d 100644 (file)
@@ -84,7 +84,8 @@ void show_page_walk(unsigned long addr)
     l4e = l4t[l4_table_offset(addr)];
     mfn = l4e_get_pfn(l4e);
     pfn = get_gpfn_from_mfn(mfn);
-    printk(" L4 = %"PRIpte" %016lx\n", l4e_get_intpte(l4e), pfn);
+    printk(" L4[0x%lx] = %"PRIpte" %016lx\n",
+           l4_table_offset(addr), l4e_get_intpte(l4e), pfn);
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return;
 
@@ -92,7 +93,8 @@ void show_page_walk(unsigned long addr)
     l3e = l3t[l3_table_offset(addr)];
     mfn = l3e_get_pfn(l3e);
     pfn = get_gpfn_from_mfn(mfn);
-    printk("  L3 = %"PRIpte" %016lx\n", l3e_get_intpte(l3e), pfn);
+    printk("  L3[0x%lx] = %"PRIpte" %016lx\n",
+           l3_table_offset(addr), l3e_get_intpte(l3e), pfn);
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
         return;
 
@@ -100,7 +102,8 @@ void show_page_walk(unsigned long addr)
     l2e = l2t[l2_table_offset(addr)];
     mfn = l2e_get_pfn(l2e);
     pfn = get_gpfn_from_mfn(mfn);
-    printk("   L2 = %"PRIpte" %016lx %s\n", l2e_get_intpte(l2e), pfn,
+    printk("   L2[0x%lx] = %"PRIpte" %016lx %s\n",
+           l2_table_offset(addr), l2e_get_intpte(l2e), pfn,
            (l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : "");
     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
          (l2e_get_flags(l2e) & _PAGE_PSE) )
@@ -110,7 +113,8 @@ void show_page_walk(unsigned long addr)
     l1e = l1t[l1_table_offset(addr)];
     mfn = l1e_get_pfn(l1e);
     pfn = get_gpfn_from_mfn(mfn);
-    printk("    L1 = %"PRIpte" %016lx\n", l1e_get_intpte(l1e), pfn);
+    printk("    L1[0x%lx] = %"PRIpte" %016lx\n",
+           l1_table_offset(addr), l1e_get_intpte(l1e), pfn);
 }
 
 asmlinkage void double_fault(void);
@@ -162,7 +166,7 @@ void toggle_guest_mode(struct vcpu *v)
 {
     v->arch.flags ^= TF_kernel_mode;
     __asm__ __volatile__ ( "swapgs" );
-    update_pagetables(v);
+    update_cr3(v);
     write_ptbase(v);
 }
 
index 6c65612799856d3e7eefef7ee90ead1845982110..36925778738a4152084a7820ad104b612a5728b3 100644 (file)
@@ -26,7 +26,6 @@
 #include <xen/trace.h>
 #include <xen/console.h>
 #include <xen/guest_access.h>
-#include <asm/shadow.h>
 #include <public/sched_ctl.h>
 #include <acm/acm_hooks.h>
 
index ad33217711d7fe1d6af78d3af462541db435b511..c8ba26071143c42899fc0ee0dd2a4eaec21f5c66 100644 (file)
@@ -434,7 +434,7 @@ __gnttab_unmap_grant_ref(
 
     /* If just unmapped a writable mapping, mark as dirtied */
     if ( !(flags & GNTMAP_readonly) )
-         gnttab_log_dirty(rd, frame);
+         gnttab_mark_dirty(rd, frame);
 
     if ( ((act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) == 0) &&
          !(flags & GNTMAP_readonly) )
@@ -731,7 +731,7 @@ __release_grant_for_copy(
     const unsigned long r_frame = act->frame;
 
     if ( !readonly )
-        gnttab_log_dirty(rd, r_frame);
+        gnttab_mark_dirty(rd, r_frame);
 
     spin_lock(&rd->grant_table->lock);
     if ( readonly )
index fb7118e71f3d18a3ccba5abf0ef33c623c9534b8..1fb50b6bd2b2590c26584d946902bc4a3e9ca3bc 100644 (file)
@@ -241,9 +241,6 @@ static void read_clocks(unsigned char key)
 }
 
 extern void dump_runq(unsigned char key);
-#ifndef NDEBUG
-extern void audit_domains_key(unsigned char key);
-#endif
 
 #ifdef PERF_COUNTERS
 extern void perfc_printall(unsigned char key);
@@ -261,10 +258,16 @@ static void do_debug_key(unsigned char key, struct cpu_user_regs *regs)
 #ifndef NDEBUG
 static void debugtrace_key(unsigned char key)
 {
-    debugtrace_send_to_console = !debugtrace_send_to_console;
-    debugtrace_dump();
-    printk("debugtrace_printk now writing to %s.\n",
-           debugtrace_send_to_console ? "console" : "buffer");
+    debugtrace_toggle();
+}
+
+static void shadow2_audit_key(unsigned char key)
+{
+    extern int shadow2_audit_enable;
+
+    shadow2_audit_enable = !shadow2_audit_enable;
+    printk("%s shadow2_audit_enable=%d\n",
+           __func__, shadow2_audit_enable);
 }
 #endif
 
@@ -288,7 +291,7 @@ void initialize_keytable(void)
 
 #ifndef NDEBUG
     register_keyhandler(
-        'o', audit_domains_key,  "audit domains >0 EXPERIMENTAL");
+        'O', shadow2_audit_key,  "toggle shadow2 audits");
     register_keyhandler(
         'T', debugtrace_key, "toggle debugtrace to console/buffer");
 #endif
index 0a631ca83e8a980993a029353733b5787e8daadc..9962c2e89a95d0d6738200f50d571d8b66eeae2a 100644 (file)
@@ -126,6 +126,11 @@ populate_physmap(
             for ( j = 0; j < (1 << extent_order); j++ )
                 guest_physmap_add_page(d, gpfn + j, mfn + j);
         }
+        else if ( unlikely(shadow2_mode_translate(d)) )
+        {
+            for ( j = 0; j < (1 << extent_order); j++ )
+                shadow2_guest_physmap_add_page(d, gpfn + j, mfn + j);
+        }
         else
         {
             for ( j = 0; j < (1 << extent_order); j++ )
@@ -153,7 +158,7 @@ guest_remove_page(
     if ( unlikely(!mfn_valid(mfn)) )
     {
         DPRINTK("Domain %u page number %lx invalid\n",
-                d->domain_id, mfn);
+                d->domain_id, gmfn);
         return 0;
     }
             
@@ -179,7 +184,7 @@ guest_remove_page(
                 (unsigned long)page->count_info, page->u.inuse.type_info);
     }
 
-    guest_physmap_remove_page(d, gmfn, mfn);
+    shadow2_guest_physmap_remove_page(d, gmfn, mfn);
 
     put_page(page);
 
@@ -250,7 +255,7 @@ translate_gpfn_list(
     if ( (d = find_domain_by_id(op.domid)) == NULL )
         return -ESRCH;
 
-    if ( !shadow_mode_translate(d) )
+    if ( !(shadow_mode_translate(d) || shadow2_mode_translate(d)) )
     {
         put_domain(d);
         return -EINVAL;
index 8bd1c28915060b43ab3a918f7edb738905845d4c..974f6e3d8e96c28f1ac4554a2ef03a7460d6d12f 100644 (file)
@@ -569,7 +569,7 @@ int console_getc(void)
 #ifndef NDEBUG
 
 /* Send output direct to console, or buffer it? */
-int debugtrace_send_to_console;
+static volatile int debugtrace_send_to_console;
 
 static char        *debugtrace_buf; /* Debug-trace buffer */
 static unsigned int debugtrace_prd; /* Producer index     */
@@ -578,17 +578,11 @@ static unsigned int debugtrace_used;
 static DEFINE_SPINLOCK(debugtrace_lock);
 integer_param("debugtrace", debugtrace_kilobytes);
 
-void debugtrace_dump(void)
+static void debugtrace_dump_worker(void)
 {
-    unsigned long flags;
-
     if ( (debugtrace_bytes == 0) || !debugtrace_used )
         return;
 
-    watchdog_disable();
-
-    spin_lock_irqsave(&debugtrace_lock, flags);
-
     printk("debugtrace_dump() starting\n");
 
     /* Print oldest portion of the ring. */
@@ -602,15 +596,47 @@ void debugtrace_dump(void)
     memset(debugtrace_buf, '\0', debugtrace_bytes);
 
     printk("debugtrace_dump() finished\n");
+}
+
+void debugtrace_toggle(void)
+{
+    unsigned long flags;
+
+    watchdog_disable();
+    spin_lock_irqsave(&debugtrace_lock, flags);
+
+    // dump the buffer *before* toggling, in case the act of dumping the
+    // buffer itself causes more printk's...
+    //
+    printk("debugtrace_printk now writing to %s.\n",
+           !debugtrace_send_to_console ? "console": "buffer");
+    if ( !debugtrace_send_to_console )
+        debugtrace_dump_worker();
+
+    debugtrace_send_to_console = !debugtrace_send_to_console;
 
     spin_unlock_irqrestore(&debugtrace_lock, flags);
+    watchdog_enable();
+
+}
+
+void debugtrace_dump(void)
+{
+    unsigned long flags;
 
+    watchdog_disable();
+    spin_lock_irqsave(&debugtrace_lock, flags);
+
+    debugtrace_dump_worker();
+
+    spin_unlock_irqrestore(&debugtrace_lock, flags);
     watchdog_enable();
 }
 
 void debugtrace_printk(const char *fmt, ...)
 {
     static char    buf[1024];
+    static u32 count;
 
     va_list       args;
     char         *p;
@@ -625,8 +651,10 @@ void debugtrace_printk(const char *fmt, ...)
 
     ASSERT(debugtrace_buf[debugtrace_bytes - 1] == 0);
 
+    sprintf(buf, "%u ", ++count);
+
     va_start(args, fmt);
-    (void)vsnprintf(buf, sizeof(buf), fmt, args);
+    (void)vsnprintf(buf + strlen(buf), sizeof(buf), fmt, args);
     va_end(args);
 
     if ( debugtrace_send_to_console )
index b2ee953361f1161a9b8a8f585d87e8705b66477b..b9fd2557d0a370776a62f30539298cc252aabff7 100644 (file)
@@ -75,6 +75,24 @@ static __inline__ void clear_bit(int nr, volatile void * addr)
                :"=m" (ADDR)
                :"dIr" (nr));
 }
+
+/**
+ * __clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * Unlike clear_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static __inline__ void __clear_bit(int nr, volatile void * addr)
+{
+       __asm__(
+               "btrl %1,%0"
+               :"=m" (ADDR)
+               :"dIr" (nr));
+}
+
 #define smp_mb__before_clear_bit()     barrier()
 #define smp_mb__after_clear_bit()      barrier()
 
index 99c74cf5ad54e347b1a65262e42a4b23ac7b5784..74a123de6f85637dce81709570772f6ed056d407 100644 (file)
 
 #ifndef __ASSEMBLY__
 extern unsigned long _end; /* standard ELF symbol */
-#endif /* __ASSEMBLY__ */
 
-#define FORCE_CRASH() __asm__ __volatile__ ( "ud2" )
+static inline void FORCE_CRASH(void) __attribute__((noreturn,always_inline)); 
+static inline void FORCE_CRASH(void) 
+{
+    __asm__ __volatile__ ( "ud2" );
+    while(1);
+}
+#endif /* __ASSEMBLY__ */
 
 #if defined(__x86_64__)
 
@@ -149,9 +154,14 @@ extern unsigned long _end; /* standard ELF symbol */
 /* Slot 256: read-only guest-accessible machine-to-phys translation table. */
 #define RO_MPT_VIRT_START       (PML4_ADDR(256))
 #define RO_MPT_VIRT_END         (RO_MPT_VIRT_START + PML4_ENTRY_BYTES/2)
+
+// currently unused?
+#if 0
 /* Slot 257: read-only guest-accessible linear page table. */
 #define RO_LINEAR_PT_VIRT_START (PML4_ADDR(257))
 #define RO_LINEAR_PT_VIRT_END   (RO_LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
+#endif
+
 /* Slot 258: linear page table (guest table). */
 #define LINEAR_PT_VIRT_START    (PML4_ADDR(258))
 #define LINEAR_PT_VIRT_END      (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
@@ -175,7 +185,7 @@ extern unsigned long _end; /* standard ELF symbol */
 #define DIRECTMAP_VIRT_START    (PML4_ADDR(262))
 #define DIRECTMAP_VIRT_END      (DIRECTMAP_VIRT_START + PML4_ENTRY_BYTES*2)
 
-#define PGT_base_page_table PGT_l4_page_table
+#define PGT_base_page_table     PGT_l4_page_table
 
 #define __HYPERVISOR_CS64 0xe010
 #define __HYPERVISOR_CS32 0xe008
@@ -274,9 +284,9 @@ extern unsigned long _end; /* standard ELF symbol */
     (L2_PAGETABLE_LAST_XEN_SLOT - L2_PAGETABLE_FIRST_XEN_SLOT + 1)
 
 #ifdef CONFIG_X86_PAE
-# define PGT_base_page_table PGT_l3_page_table
+# define PGT_base_page_table     PGT_l3_page_table
 #else
-# define PGT_base_page_table PGT_l2_page_table
+# define PGT_base_page_table     PGT_l2_page_table
 #endif
 
 #define __HYPERVISOR_CS 0xe008
index a0efe89f0aeb5a5dca1571f509689e1d4bb4350b..2ef0775795b43c208c94d1ad9cec90aec6e8030a 100644 (file)
@@ -73,42 +73,42 @@ struct arch_domain
     /* I/O-port admin-specified access capabilities. */
     struct rangeset *ioport_caps;
 
-    /* Shadow mode status and controls. */
-    struct shadow_ops *ops;
-    unsigned int shadow_mode;  /* flags to control shadow table operation */
-    unsigned int shadow_nest;  /* Recursive depth of shadow_lock() nesting */
-
-    /* shadow hashtable */
-    struct shadow_status *shadow_ht;
-    struct shadow_status *shadow_ht_free;
-    struct shadow_status *shadow_ht_extras; /* extra allocation units */
-    unsigned int shadow_extras_count;
-
-    /* shadow dirty bitmap */
+    /* HVM stuff */
+    struct hvm_domain   hvm_domain;
+
+    /* Shadow-translated guest: Pseudophys base address of reserved area. */
+    unsigned long first_reserved_pfn;
+
+    /* Shadow2 stuff */
+    u32               shadow2_mode;  /* flags to control shadow operation */
+    spinlock_t        shadow2_lock;  /* shadow2 domain lock */
+    int               shadow2_locker; /* processor which holds the lock */
+    const char       *shadow2_locker_function; /* Func that took it */
+    struct list_head  shadow2_freelists[SHADOW2_MAX_ORDER + 1]; 
+    struct list_head  shadow2_p2m_freelist;
+    struct list_head  shadow2_p2m_inuse;
+    struct list_head  shadow2_toplevel_shadows;
+    unsigned int      shadow2_total_pages;  /* number of pages allocated */
+    unsigned int      shadow2_free_pages;   /* number of pages on freelists */
+    unsigned int      shadow2_p2m_pages;    /* number of pages in p2m map */
+
+    /* Shadow2 hashtable */
+    struct shadow2_hash_entry *shadow2_hash_table;
+    struct shadow2_hash_entry *shadow2_hash_freelist;
+    struct shadow2_hash_entry *shadow2_hash_allocations;
+    int shadow2_hash_walking;  /* Some function is walking the hash table */
+
+    /* Shadow log-dirty bitmap */
     unsigned long *shadow_dirty_bitmap;
     unsigned int shadow_dirty_bitmap_size;  /* in pages, bit per page */
 
-    /* shadow mode stats */
-    unsigned int shadow_page_count;
-    unsigned int hl2_page_count;
-    unsigned int snapshot_page_count;
-
+    /* Shadow log-dirty mode stats */
     unsigned int shadow_fault_count;
     unsigned int shadow_dirty_count;
 
-    /* full shadow mode */
-    struct out_of_sync_entry *out_of_sync; /* list of out-of-sync pages */
-    struct out_of_sync_entry *out_of_sync_free;
-    struct out_of_sync_entry *out_of_sync_extras;
-    unsigned int out_of_sync_extras_count;
+    /* Shadow translated domain: P2M mapping */
+    pagetable_t phys_table;
 
-    struct list_head free_shadow_frames;
-
-    pagetable_t         phys_table;         /* guest 1:1 pagetable */
-    struct hvm_domain   hvm_domain;
-
-    /* Shadow-translated guest: Pseudophys base address of reserved area. */
-    unsigned long first_reserved_pfn;
 } __cacheline_aligned;
 
 #ifdef CONFIG_X86_PAE
@@ -166,25 +166,34 @@ struct arch_vcpu
      */
     l1_pgentry_t *perdomain_ptes;
 
-    pagetable_t  guest_table_user;      /* x86/64: user-space pagetable. */
-    pagetable_t  guest_table;           /* (MA) guest notion of cr3 */
-    pagetable_t  shadow_table;          /* (MA) shadow of guest */
-    pagetable_t  monitor_table;         /* (MA) used in hypervisor */
-
-    l2_pgentry_t *guest_vtable;         /* virtual address of pagetable */
-    l2_pgentry_t *shadow_vtable;        /* virtual address of shadow_table */
-    l2_pgentry_t *monitor_vtable;              /* virtual address of monitor_table */
-    l1_pgentry_t *hl2_vtable;                  /* virtual address of hl2_table */
-
 #ifdef CONFIG_X86_64
-    l3_pgentry_t *guest_vl3table;
-    l4_pgentry_t *guest_vl4table;
+    pagetable_t guest_table_user;       /* (MFN) x86/64 user-space pagetable */
 #endif
+    pagetable_t guest_table;            /* (MFN) guest notion of cr3 */
+    /* guest_table holds a ref to the page, and also a type-count unless
+     * shadow refcounts are in use */
+    pagetable_t shadow_table;           /* (MFN) shadow of guest */
+    pagetable_t monitor_table;          /* (MFN) hypervisor PT (for HVM) */
+    unsigned long cr3;                     /* (MA) value to install in HW CR3 */
 
-    unsigned long monitor_shadow_ref;
+    void *guest_vtable;                 /* virtual address of pagetable */
+    void *shadow_vtable;                /* virtual address of shadow_table */
+    root_pgentry_t *monitor_vtable;            /* virtual address of monitor_table */
 
     /* Current LDT details. */
     unsigned long shadow_ldt_mapcnt;
+
+    /* Shadow2 stuff */
+    /* -- pointers to mode-specific entry points */
+    struct shadow2_entry_points *shadow2; 
+    unsigned long last_emulated_mfn;    /* last mfn we emulated a write to */
+    u8 shadow2_propagate_fault;         /* emulated fault needs to be */
+                                        /* propagated to guest */
+#if CONFIG_PAGING_LEVELS >= 3
+    u8 shadow2_pae_flip_pending;        /* shadow update requires this PAE cpu
+                                         * to recopy/install its L3 table.
+                                         */
+#endif
 } __cacheline_aligned;
 
 /* shorthands to improve code legibility */
index 5c6600ac7e123861d32e311ccec6b5ae963287d2..277b93ca0c1193cf5062641bcb5d4022dc0c7262 100644 (file)
@@ -31,7 +31,7 @@ int destroy_grant_host_mapping(
 #define gnttab_shared_gmfn(d, t, i)                     \
     (mfn_to_gmfn(d, gnttab_shared_mfn(d, t, i)))
 
-#define gnttab_log_dirty(d, f) mark_dirty((d), (f))
+#define gnttab_mark_dirty(d, f) mark_dirty((d), (f))
 
 static inline void gnttab_clear_flag(unsigned long nr, uint16_t *addr)
 {
index 73f3b3127540e4927dbd2cdc2b4bd78ea5d881ac..cb573e5d9c1617887c737447cca7c7697a96a7a1 100644 (file)
@@ -56,9 +56,16 @@ struct hvm_function_table {
      */
     int (*realmode)(struct vcpu *v);
     int (*paging_enabled)(struct vcpu *v);
+    int (*long_mode_enabled)(struct vcpu *v);
+    int (*guest_x86_mode)(struct vcpu *v);
     int (*instruction_length)(struct vcpu *v);
     unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num);
 
+    /* 
+     * Re-set the value of CR3 that Xen runs on when handling VM exits
+     */
+    void (*update_host_cr3)(struct vcpu *v);
+
     /*
      * Update specifics of the guest state:
      * 1) TS bit in guest cr0 
@@ -133,12 +140,30 @@ hvm_paging_enabled(struct vcpu *v)
     return hvm_funcs.paging_enabled(v);
 }
 
+static inline int
+hvm_long_mode_enabled(struct vcpu *v)
+{
+    return hvm_funcs.long_mode_enabled(v);
+}
+
+static inline int
+hvm_guest_x86_mode(struct vcpu *v)
+{
+    return hvm_funcs.guest_x86_mode(v);
+}
+
 static inline int
 hvm_instruction_length(struct vcpu *v)
 {
     return hvm_funcs.instruction_length(v);
 }
 
+static inline void
+hvm_update_host_cr3(struct vcpu *v)
+{
+    hvm_funcs.update_host_cr3(v);
+}
+
 void hvm_hypercall_page_initialise(struct domain *d,
                                    void *hypercall_page);
 
index 35a0bfe4644ef8af611c3a0b814676d8e1def9bc..6ccfdee6789307c70316d5d8bd67d9b8a0dfba4d 100644 (file)
@@ -116,10 +116,13 @@ enum hval_bitmaps {
 #define DBG_LEVEL_IOAPIC            (1 << 9)
 
 extern unsigned int opt_hvm_debug_level;
-#define HVM_DBG_LOG(level, _f, _a...)           \
-    if ( (level) & opt_hvm_debug_level )        \
-        printk("[HVM:%d.%d] <%s> " _f "\n",     \
-               current->domain->domain_id, current->vcpu_id, __func__, ## _a)
+#define HVM_DBG_LOG(level, _f, _a...)                                         \
+    do {                                                                      \
+        if ( (level) & opt_hvm_debug_level )                                  \
+            printk("[HVM:%d.%d] <%s> " _f "\n",                               \
+                   current->domain->domain_id, current->vcpu_id, __func__,    \
+                   ## _a);                                                    \
+    } while (0)
 #else
 #define HVM_DBG_LOG(level, _f, _a...)
 #endif
index f89b6ad7870a415f97b29bc9eb935517ec2474e3..b607a4578bd606ec2c56e0c8a79cf05cfcdf1725 100644 (file)
@@ -29,6 +29,7 @@
 #define HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI     1
 
 struct hvm_vcpu {
+    unsigned long       hw_cr3;     /* value we give to HW to use */
     unsigned long       ioflags;
     struct hvm_io_op    io_op;
     struct vlapic       *vlapic;
@@ -40,6 +41,11 @@ struct hvm_vcpu {
 
     int                 xen_port;
 
+#if CONFIG_PAGING_LEVELS >= 3
+    l3_pgentry_t hvm_lowmem_l3tab[4]
+    __attribute__((__aligned__(32)));
+#endif
+
     /* Flags */
     int                 flag_dr_dirty;
 
index 85ee7046fd478619b64940dfa84da884567c9592..524411be349fc84bdc9004b85eeccb0896beed9b 100644 (file)
@@ -87,6 +87,7 @@ struct arch_vmx_struct {
 
     unsigned long        cpu_cr0; /* copy of guest CR0 */
     unsigned long        cpu_shadow_cr0; /* copy of guest read shadow CR0 */
+    unsigned long        cpu_shadow_cr4; /* copy of guest read shadow CR4 */
     unsigned long        cpu_cr2; /* save CR2 */
     unsigned long        cpu_cr3;
     unsigned long        cpu_state;
index 38ae0e3b0f866ab7022be2d1f785d6a23637e9a5..38e447259c7e6c1924a3f7aad2d099ba2d3bc4b2 100644 (file)
@@ -298,6 +298,9 @@ static always_inline void __vmwrite_vcpu(
     case GUEST_CR0:
         v->arch.hvm_vmx.cpu_cr0 = value;
         break;
+    case CR4_READ_SHADOW:
+        v->arch.hvm_vmx.cpu_shadow_cr4 = value;
+        break;
     case CPU_BASED_VM_EXEC_CONTROL:
         v->arch.hvm_vmx.cpu_based_exec_control = value;
         break;
@@ -317,11 +320,14 @@ static always_inline void __vmread_vcpu(
     case GUEST_CR0:
         *value = v->arch.hvm_vmx.cpu_cr0;
         break;
+    case CR4_READ_SHADOW:
+        *value = v->arch.hvm_vmx.cpu_shadow_cr4;
+        break;
     case CPU_BASED_VM_EXEC_CONTROL:
         *value = v->arch.hvm_vmx.cpu_based_exec_control;
         break;
     default:
-        printk("__vmread_cpu: invalid field %lx\n", field);
+        printk("__vmread_vcpu: invalid field %lx\n", field);
         break;
     }
 }
@@ -342,6 +348,7 @@ static inline int __vmwrite(unsigned long field, unsigned long value)
     switch ( field ) {
     case CR0_READ_SHADOW:
     case GUEST_CR0:
+    case CR4_READ_SHADOW:
     case CPU_BASED_VM_EXEC_CONTROL:
         __vmwrite_vcpu(v, field, value);
         break;
@@ -404,6 +411,46 @@ static inline int vmx_paging_enabled(struct vcpu *v)
     return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG);
 }
 
+/* Works only for vcpu == current */
+static inline int vmx_long_mode_enabled(struct vcpu *v)
+{
+    ASSERT(v == current);
+    return VMX_LONG_GUEST(current);
+}
+
+/* Works only for vcpu == current */
+static inline int vmx_realmode(struct vcpu *v)
+{
+    unsigned long rflags;
+    ASSERT(v == current);
+
+    __vmread(GUEST_RFLAGS, &rflags);
+    return rflags & X86_EFLAGS_VM;
+}
+
+/* Works only for vcpu == current */
+static inline void vmx_update_host_cr3(struct vcpu *v)
+{
+    ASSERT(v == current);
+    __vmwrite(HOST_CR3, v->arch.cr3);
+}
+
+static inline int vmx_guest_x86_mode(struct vcpu *v)
+{
+    unsigned long cs_ar_bytes;
+    ASSERT(v == current);
+
+    if ( vmx_long_mode_enabled(v) )
+    {
+        __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
+        return (cs_ar_bytes & (1u<<13)) ? 8 : 4;
+    }
+    if ( vmx_realmode(v) )
+        return 2;
+    __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
+    return (cs_ar_bytes & (1u<<14)) ? 4 : 2;
+}
+
 static inline int vmx_pgbit_test(struct vcpu *v)
 {
     unsigned long cr0;
index 06ea59875454ff99a3f24036b39a6b3ffbd5f210..0b19fbe7ec2628911d8c2d0e46eb7926be1a2647 100644 (file)
 struct page_info
 {
     /* Each frame can be threaded onto a doubly-linked list. */
-    struct list_head list;
+    union {
+        struct list_head list;
+        /* Shadow2 uses this field as an up-pointer in lower-level shadows */
+        paddr_t up;
+    };
 
     /* Reference count and various PGC_xxx flags and fields. */
     u32 count_info;
@@ -46,8 +50,20 @@ struct page_info
 
     } u;
 
-    /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */
-    u32 tlbflush_timestamp;
+    union {
+        /* Timestamp from 'TLB clock', used to reduce need for safety
+         * flushes.  Only valid on a) free pages, and b) guest pages with a
+         * zero type count. */
+        u32 tlbflush_timestamp;
+
+        /* Only used on guest pages with a shadow.
+         * Guest pages with a shadow must have a non-zero type count, so this
+         * does not conflict with the tlbflush timestamp. */
+        u32 shadow2_flags;
+
+        // XXX -- we expect to add another field here, to be used for min/max
+        // purposes, which is only used for shadow pages.
+    };
 };
 
  /* The following page types are MUTUALLY EXCLUSIVE. */
@@ -60,6 +76,7 @@ struct page_info
 #define PGT_ldt_page        (6U<<29) /* using this page in an LDT? */
 #define PGT_writable_page   (7U<<29) /* has writable mappings of this page? */
 
+#ifndef SHADOW2
 #define PGT_l1_shadow       PGT_l1_page_table
 #define PGT_l2_shadow       PGT_l2_page_table
 #define PGT_l3_shadow       PGT_l3_page_table
@@ -69,14 +86,16 @@ struct page_info
 #define PGT_writable_pred   (7U<<29) /* predicted gpfn with writable ref */
 
 #define PGT_fl1_shadow      (5U<<29)
+#endif
+
 #define PGT_type_mask       (7U<<29) /* Bits 29-31. */
 
- /* Has this page been validated for use as its current type? */
-#define _PGT_validated      28
-#define PGT_validated       (1U<<_PGT_validated)
  /* Owning guest has pinned this page to its current type? */
-#define _PGT_pinned         27
+#define _PGT_pinned         28
 #define PGT_pinned          (1U<<_PGT_pinned)
+ /* Has this page been validated for use as its current type? */
+#define _PGT_validated      27
+#define PGT_validated       (1U<<_PGT_validated)
 #if defined(__i386__)
  /* The 11 most significant bits of virt address if this is a page table. */
 #define PGT_va_shift        16
@@ -98,6 +117,7 @@ struct page_info
  /* 16-bit count of uses of this frame as its current type. */
 #define PGT_count_mask      ((1U<<16)-1)
 
+#ifndef SHADOW2
 #ifdef __x86_64__
 #define PGT_high_mfn_shift  52
 #define PGT_high_mfn_mask   (0xfffUL << PGT_high_mfn_shift)
@@ -112,19 +132,53 @@ struct page_info
 #define PGT_score_shift     23
 #define PGT_score_mask      (((1U<<4)-1)<<PGT_score_shift)
 #endif
+#endif /* SHADOW2 */
 
  /* Cleared when the owning guest 'frees' this page. */
 #define _PGC_allocated      31
 #define PGC_allocated       (1U<<_PGC_allocated)
- /* Set when fullshadow mode marks a page out-of-sync */
+ /* Set on a *guest* page to mark it out-of-sync with its shadow */
 #define _PGC_out_of_sync     30
 #define PGC_out_of_sync     (1U<<_PGC_out_of_sync)
- /* Set when fullshadow mode is using a page as a page table */
+ /* Set when Xen is using a page as a page table */
 #define _PGC_page_table      29
 #define PGC_page_table      (1U<<_PGC_page_table)
  /* 29-bit count of references to this frame. */
 #define PGC_count_mask      ((1U<<29)-1)
 
+/* shadow2 uses the count_info on shadow pages somewhat differently */
+/* NB: please coordinate any changes here with the SH2F's in shadow2.h */
+#define PGC_SH2_none           (0U<<28) /* on the shadow2 free list */
+#define PGC_SH2_min_shadow     (1U<<28)
+#define PGC_SH2_l1_32_shadow   (1U<<28) /* shadowing a 32-bit L1 guest page */
+#define PGC_SH2_fl1_32_shadow  (2U<<28) /* L1 shadow for a 32b 4M superpage */
+#define PGC_SH2_l2_32_shadow   (3U<<28) /* shadowing a 32-bit L2 guest page */
+#define PGC_SH2_l1_pae_shadow  (4U<<28) /* shadowing a pae L1 page */
+#define PGC_SH2_fl1_pae_shadow (5U<<28) /* L1 shadow for pae 2M superpg */
+#define PGC_SH2_l2_pae_shadow  (6U<<28) /* shadowing a pae L2-low page */
+#define PGC_SH2_l2h_pae_shadow (7U<<28) /* shadowing a pae L2-high page */
+#define PGC_SH2_l3_pae_shadow  (8U<<28) /* shadowing a pae L3 page */
+#define PGC_SH2_l1_64_shadow   (9U<<28) /* shadowing a 64-bit L1 page */
+#define PGC_SH2_fl1_64_shadow (10U<<28) /* L1 shadow for 64-bit 2M superpg */
+#define PGC_SH2_l2_64_shadow  (11U<<28) /* shadowing a 64-bit L2 page */
+#define PGC_SH2_l3_64_shadow  (12U<<28) /* shadowing a 64-bit L3 page */
+#define PGC_SH2_l4_64_shadow  (13U<<28) /* shadowing a 64-bit L4 page */
+#define PGC_SH2_max_shadow    (13U<<28)
+#define PGC_SH2_p2m_table     (14U<<28) /* in use as the p2m table */
+#define PGC_SH2_monitor_table (15U<<28) /* in use as a monitor table */
+#define PGC_SH2_unused        (15U<<28)
+
+#define PGC_SH2_type_mask     (15U<<28)
+#define PGC_SH2_type_shift          28
+
+#define PGC_SH2_pinned         (1U<<27)
+
+#define _PGC_SH2_log_dirty          26
+#define PGC_SH2_log_dirty      (1U<<26)
+
+/* 26 bit ref count for shadow pages */
+#define PGC_SH2_count_mask    ((1U<<26) - 1)
+
 /* We trust the slab allocator in slab.c, and our use of it. */
 #define PageSlab(page)     (1)
 #define PageSetSlab(page)   ((void)0)
@@ -134,16 +188,24 @@ struct page_info
 
 #if defined(__i386__)
 #define pickle_domptr(_d)   ((u32)(unsigned long)(_d))
-#define unpickle_domptr(_d) ((struct domain *)(unsigned long)(_d))
+static inline struct domain *unpickle_domptr(u32 _domain)
+{ return (_domain & 1) ? NULL : (void *)_domain; }
 #define PRtype_info "08lx" /* should only be used for printk's */
 #elif defined(__x86_64__)
 static inline struct domain *unpickle_domptr(u32 _domain)
-{ return (_domain == 0) ? NULL : __va(_domain); }
+{ return ((_domain == 0) || (_domain & 1)) ? NULL : __va(_domain); }
 static inline u32 pickle_domptr(struct domain *domain)
 { return (domain == NULL) ? 0 : (u32)__pa(domain); }
 #define PRtype_info "016lx"/* should only be used for printk's */
 #endif
 
+/* The order of the largest allocation unit we use for shadow pages */
+#if CONFIG_PAGING_LEVELS == 2
+#define SHADOW2_MAX_ORDER 0 /* Only ever need 4k allocations */
+#else  
+#define SHADOW2_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
+#endif
+
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
 
@@ -165,7 +227,7 @@ extern void invalidate_shadow_ldt(struct vcpu *d);
 extern int shadow_remove_all_write_access(
     struct domain *d, unsigned long gmfn, unsigned long mfn);
 extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn);
-extern int _shadow_mode_refcounts(struct domain *d);
+extern int _shadow2_mode_refcounts(struct domain *d);
 
 static inline void put_page(struct page_info *page)
 {
@@ -197,8 +259,8 @@ static inline int get_page(struct page_info *page,
              unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
              unlikely(d != _domain) )                /* Wrong owner? */
         {
-            if ( !_shadow_mode_refcounts(domain) )
-                DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
+            if ( !_shadow2_mode_refcounts(domain) )
+                DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%" 
                         PRtype_info "\n",
                         page_to_mfn(page), domain, unpickle_domptr(d),
                         x, page->u.inuse.type_info);
@@ -254,6 +316,16 @@ static inline int page_is_removable(struct page_info *page)
     ASSERT(((_p)->count_info & PGC_count_mask) != 0);          \
     ASSERT(page_get_owner(_p) == (_d))
 
+// Quick test for whether a given page can be represented directly in CR3.
+//
+#if CONFIG_PAGING_LEVELS == 3
+#define MFN_FITS_IN_CR3(_MFN) !(mfn_x(_MFN) >> 20)
+
+/* returns a lowmem machine address of the copied L3 root table */
+unsigned long
+pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab);
+#endif /* CONFIG_PAGING_LEVELS == 3 */
+
 int check_descriptor(struct desc_struct *d);
 
 /*
@@ -271,29 +343,44 @@ int check_descriptor(struct desc_struct *d);
 #define set_gpfn_from_mfn(mfn, pfn) (machine_to_phys_mapping[(mfn)] = (pfn))
 #define get_gpfn_from_mfn(mfn)      (machine_to_phys_mapping[(mfn)])
 
+
+#define mfn_to_gmfn(_d, mfn)                            \
+    ( (shadow2_mode_translate(_d))                      \
+      ? get_gpfn_from_mfn(mfn)                          \
+      : (mfn) )
+
+#define gmfn_to_mfn(_d, gpfn)  mfn_x(sh2_gfn_to_mfn(_d, gpfn))
+
+
 /*
  * The phys_to_machine_mapping is the reversed mapping of MPT for full
  * virtualization.  It is only used by shadow_mode_translate()==true
  * guests, so we steal the address space that would have normally
  * been used by the read-only MPT map.
  */
-#define phys_to_machine_mapping ((unsigned long *)RO_MPT_VIRT_START)
-#define NR_P2M_TABLE_ENTRIES    ((unsigned long *)RO_MPT_VIRT_END \
-                                 - phys_to_machine_mapping)
+#define phys_to_machine_mapping ((l1_pgentry_t *)RO_MPT_VIRT_START)
 #define INVALID_MFN             (~0UL)
 #define VALID_MFN(_mfn)         (!((_mfn) & (1U<<31)))
 
-#define set_mfn_from_gpfn(pfn, mfn) (phys_to_machine_mapping[(pfn)] = (mfn))
 static inline unsigned long get_mfn_from_gpfn(unsigned long pfn)
 {
-    unsigned long mfn;
+    l1_pgentry_t l1e = l1e_empty();
+    int ret;
+
+#if CONFIG_PAGING_LEVELS > 2
+    if ( pfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof (l1_pgentry_t) ) 
+        /* This pfn is higher than the p2m map can hold */
+        return INVALID_MFN;
+#endif
+
+    ret = __copy_from_user(&l1e,
+                               &phys_to_machine_mapping[pfn],
+                               sizeof(l1e));
 
-    if ( unlikely(pfn >= NR_P2M_TABLE_ENTRIES) ||
-         unlikely(__copy_from_user(&mfn, &phys_to_machine_mapping[pfn],
-                                   sizeof(mfn))) )
-       mfn = INVALID_MFN;
+    if ( (ret == 0) && (l1e_get_flags(l1e) & _PAGE_PRESENT) )
+        return l1e_get_pfn(l1e);
 
-    return mfn;
+    return INVALID_MFN;
 }
 
 #ifdef MEMORY_GUARD
@@ -333,6 +420,7 @@ void audit_domains(void);
 #endif
 
 int new_guest_cr3(unsigned long pfn);
+void make_cr3(struct vcpu *v, unsigned long mfn);
 
 void propagate_page_fault(unsigned long addr, u16 error_code);
 
index f1c08cf500390a5ae2215f910b04fcb0ceae45d5..07c09b2ae2c44e1b16f8c26b0ffdf0b9d6b1ea9b 100644 (file)
@@ -112,6 +112,10 @@ static inline void wrmsrl(unsigned int msr, __u64 val)
 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
 #define MSR_IA32_VMX_MISC_MSR                   0x485
+#define MSR_IA32_VMX_CR0_FIXED0                 0x486
+#define MSR_IA32_VMX_CR0_FIXED1                 0x487
+#define MSR_IA32_VMX_CR4_FIXED0                 0x488
+#define MSR_IA32_VMX_CR4_FIXED1                 0x489
 #define IA32_FEATURE_CONTROL_MSR                0x3a
 #define IA32_FEATURE_CONTROL_MSR_LOCK           0x1
 #define IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON   0x4
index cf5595b078a2f1884a0676edc67a548dbdb0e022..e93206169ae9a0a922af70592054c3d195e063f9 100644 (file)
@@ -89,15 +89,8 @@ static inline l2_pgentry_32_t l2e_from_paddr_32(paddr_t pa, unsigned int flags)
 
 #define linear_l1_table_32                                                 \
     ((l1_pgentry_32_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table_32                                                 \
-    ((l2_pgentry_32_t *)(LINEAR_PT_VIRT_START +                            \
-                     (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0))))
 
 #define linear_pg_table_32 linear_l1_table_32
-#define linear_l2_table_32(_ed) ((_ed)->arch.guest_vtable)
-
-#define va_to_l1mfn_32(_ed, _va) \
-    (l2e_get_pfn(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]))
 
 #endif /* __X86_PAGE_GUEST_H__ */
 
index 643240206601694856b6e64642729aea233adea7..94158c7f3d199270ecd20b0ac9ce322e8700a219 100644 (file)
@@ -233,26 +233,18 @@ typedef struct { u64 pfn; } pagetable_t;
      + DOMAIN_ENTRIES_PER_L4_PAGETABLE)
 #endif
 
-#define LINEAR_PT_OFFSET (LINEAR_PT_VIRT_START & VADDR_MASK)
-#define linear_l1_table                                             \
-    ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table                                           \
-    ((l2_pgentry_t *)(LINEAR_PT_VIRT_START +                        \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0))))
-#define __linear_l3_table                                           \
-    ((l3_pgentry_t *)(LINEAR_PT_VIRT_START +                        \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) +   \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1))))
-#define __linear_l4_table                                           \
-    ((l4_pgentry_t *)(LINEAR_PT_VIRT_START +                        \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) +   \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1)) +   \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<2))))
-
+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
+#define linear_l1_table __linear_l1_table
 #define linear_pg_table linear_l1_table
-#define linear_l2_table(v) ((v)->arch.guest_vtable)
-#define linear_l3_table(v) ((v)->arch.guest_vl3table)
-#define linear_l4_table(v) ((v)->arch.guest_vl4table)
+#define linear_l2_table(v) ((l2_pgentry_t *)(v)->arch.guest_vtable)
 
 #ifndef __ASSEMBLY__
 #if CONFIG_PAGING_LEVELS == 3
@@ -294,6 +286,7 @@ extern void paging_init(void);
 #define _PAGE_AVAIL1   0x400U
 #define _PAGE_AVAIL2   0x800U
 #define _PAGE_AVAIL    0xE00U
+#define _PAGE_PSE_PAT 0x1000U
 
 /*
  * Debug option: Ensure that granted mappings are not implicitly unmapped.
@@ -307,9 +300,9 @@ extern void paging_init(void);
 #endif
 
 /*
- * Disallow unused flag bits plus PAT, PSE and GLOBAL. Also disallow GNTTAB
- * if we are using it for grant-table debugging. Permit the NX bit if the
- * hardware supports it.
+ * Disallow unused flag bits plus PAT, PSE and GLOBAL.
+ * Also disallow GNTTAB if we are using it for grant-table debugging.
+ * Permit the NX bit if the hardware supports it.
  */
 #define BASE_DISALLOW_MASK ((0xFFFFF180U | _PAGE_GNTTAB) & ~_PAGE_NX)
 
index 54bc01ea7c6a7224df6e2c06b1be299bf1254b61..d6e24b207d9e4ce14cc1256bd36cb62f1b9928a5 100644 (file)
@@ -144,4 +144,57 @@ PERFCOUNTER_CPU(remove_write_predicted, "remove_write predict hit&exit")
 PERFCOUNTER_CPU(remove_write_bad_prediction, "remove_write bad prediction")
 PERFCOUNTER_CPU(update_hl2e_invlpg,     "update_hl2e calls invlpg")
 
+/* Shadow2 counters */
+PERFCOUNTER_CPU(shadow2_alloc,          "calls to shadow2_alloc")
+PERFCOUNTER_CPU(shadow2_alloc_tlbflush, "shadow2_alloc flushed TLBs")
+PERFSTATUS(shadow2_alloc_count,         "number of shadow pages in use")
+PERFCOUNTER_CPU(shadow2_free,           "calls to shadow2_free")
+PERFCOUNTER_CPU(shadow2_prealloc_1,     "shadow2 recycles old shadows")
+PERFCOUNTER_CPU(shadow2_prealloc_2,     "shadow2 recycles in-use shadows")
+PERFCOUNTER_CPU(shadow2_linear_map_failed, "shadow2 hit read-only linear map")
+PERFCOUNTER_CPU(shadow2_a_update,       "shadow2 A bit update")
+PERFCOUNTER_CPU(shadow2_ad_update,      "shadow2 A&D bit update")
+PERFCOUNTER_CPU(shadow2_fault,          "calls to shadow2_fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_bad_gfn, "shadow2_fault guest bad gfn")
+PERFCOUNTER_CPU(shadow2_fault_bail_not_present, 
+                                        "shadow2_fault guest not-present")
+PERFCOUNTER_CPU(shadow2_fault_bail_nx,  "shadow2_fault guest NX fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_ro_mapping, "shadow2_fault guest R/W fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_user_supervisor, 
+                                        "shadow2_fault guest U/S fault")
+PERFCOUNTER_CPU(shadow2_fault_emulate_read, "shadow2_fault emulates a read")
+PERFCOUNTER_CPU(shadow2_fault_emulate_write, "shadow2_fault emulates a write")
+PERFCOUNTER_CPU(shadow2_fault_emulate_failed, "shadow2_fault emulator fails")
+PERFCOUNTER_CPU(shadow2_fault_mmio,     "shadow2_fault handled as mmio")
+PERFCOUNTER_CPU(shadow2_fault_fixed,    "shadow2_fault fixed fault")
+PERFCOUNTER_CPU(shadow2_ptwr_emulate,   "shadow2 causes ptwr to emulate")
+PERFCOUNTER_CPU(shadow2_validate_gl1e_calls, "calls to shadow2_validate_gl1e")
+PERFCOUNTER_CPU(shadow2_validate_gl2e_calls, "calls to shadow2_validate_gl2e")
+PERFCOUNTER_CPU(shadow2_validate_gl3e_calls, "calls to shadow2_validate_gl3e")
+PERFCOUNTER_CPU(shadow2_validate_gl4e_calls, "calls to shadow2_validate_gl4e")
+PERFCOUNTER_CPU(shadow2_hash_lookups,   "calls to shadow2_hash_lookup")
+PERFCOUNTER_CPU(shadow2_hash_lookup_head, "shadow2 hash hit in bucket head")
+PERFCOUNTER_CPU(shadow2_hash_lookup_miss, "shadow2 hash misses")
+PERFCOUNTER_CPU(shadow2_get_shadow_status, "calls to get_shadow_status")
+PERFCOUNTER_CPU(shadow2_hash_inserts,   "calls to shadow2_hash_insert")
+PERFCOUNTER_CPU(shadow2_hash_deletes,   "calls to shadow2_hash_delete")
+PERFCOUNTER_CPU(shadow2_writeable,      "shadow2 removes write access")
+PERFCOUNTER_CPU(shadow2_writeable_h_1,  "shadow2 writeable: 32b w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_2,  "shadow2 writeable: 32pae w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_3,  "shadow2 writeable: 64b w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_4,  "shadow2 writeable: 32b linux low")
+PERFCOUNTER_CPU(shadow2_writeable_bf,   "shadow2 writeable brute-force")
+PERFCOUNTER_CPU(shadow2_mappings,       "shadow2 removes all mappings")
+PERFCOUNTER_CPU(shadow2_mappings_bf,    "shadow2 rm-mappings brute-force")
+PERFCOUNTER_CPU(shadow2_early_unshadow, "shadow2 unshadows for fork/exit")
+PERFCOUNTER_CPU(shadow2_early_unshadow_top, "shadow2 unhooks for fork/exit")
+PERFCOUNTER_CPU(shadow2_unshadow,       "shadow2 unshadows a page")
+PERFCOUNTER_CPU(shadow2_up_pointer,     "shadow2 unshadow by up-pointer")
+PERFCOUNTER_CPU(shadow2_unshadow_bf,    "shadow2 unshadow brute-force")
+PERFCOUNTER_CPU(shadow2_get_page_fail,  "shadow2_get_page_from_l1e failed")
+PERFCOUNTER_CPU(shadow2_guest_walk,     "shadow2 walks guest tables")
+PERFCOUNTER_CPU(shadow2_walk_cache_hit, "shadow2 walk-cache hits")
+PERFCOUNTER_CPU(shadow2_walk_cache_miss, "shadow2 walk-cache misses")
+
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
index d460544d3eba1f675a82bac108e7671dda69c3b0..81c8757f8e5aff8a7a587d551870cad5571ea755 100644 (file)
@@ -545,6 +545,7 @@ extern always_inline void prefetchw(const void *x)
 #endif
 
 void show_stack(struct cpu_user_regs *regs);
+void show_xen_trace(void);
 void show_stack_overflow(unsigned long esp);
 void show_registers(struct cpu_user_regs *regs);
 void show_execution_state(struct cpu_user_regs *regs);
index 7144b24d8bbd590bb460ac65fedc441f608ab232..efade3021ca5527f76f43c2f0b65b2389736bc56 100644 (file)
@@ -1,8 +1,7 @@
 /******************************************************************************
  * include/asm-x86/shadow.h
  * 
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
+ * Copyright (c) 2006 by XenSource Inc.
  * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
 #ifndef _XEN_SHADOW_H
 #define _XEN_SHADOW_H
 
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/perfc.h>
-#include <xen/sched.h>
-#include <xen/mm.h>
-#include <xen/domain_page.h>
-#include <asm/current.h>
-#include <asm/flushtlb.h>
-#include <asm/processor.h>
-#include <asm/hvm/hvm.h>
-#include <asm/hvm/support.h>
-#include <asm/regs.h>
-#include <public/dom0_ops.h>
-#include <asm/shadow_public.h>
-#include <asm/page-guest32.h>
-#include <asm/shadow_ops.h>
+/* This file is just a wrapper around the new Shadow2 header,
+ * providing names that must be defined in any shadow implementation. */
 
-/* Shadow PT operation mode : shadow-mode variable in arch_domain. */
+#include <asm/shadow2.h>
 
-#define SHM_enable    (1<<0) /* we're in one of the shadow modes */
-#define SHM_refcounts (1<<1) /* refcounts based on shadow tables instead of
-                                guest tables */
-#define SHM_write_all (1<<2) /* allow write access to all guest pt pages,
-                                regardless of pte write permissions */
-#define SHM_log_dirty (1<<3) /* enable log dirty mode */
-#define SHM_translate (1<<4) /* Xen does p2m translation, not guest */
-#define SHM_external  (1<<5) /* Xen does not steal address space from the
-                                domain for its own booking; requires VT or
-                                similar mechanisms */
-#define SHM_wr_pt_pte (1<<6) /* guest allowed to set PAGE_RW bit in PTEs which
-                                point to page table pages. */
+/* How to make sure a page is not referred to in a shadow PT */
+/* This will need to be a for_each_vcpu if we go to per-vcpu shadows */
+#define shadow_drop_references(_d, _p)                      \
+    shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
+#define shadow_sync_and_drop_references(_d, _p)             \
+    shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
 
-#define shadow_mode_enabled(_d)   ((_d)->arch.shadow_mode)
-#define shadow_mode_refcounts(_d) ((_d)->arch.shadow_mode & SHM_refcounts)
-#define shadow_mode_write_l1(_d)  (VM_ASSIST(_d, VMASST_TYPE_writable_pagetables))
-#define shadow_mode_write_all(_d) ((_d)->arch.shadow_mode & SHM_write_all)
-#define shadow_mode_log_dirty(_d) ((_d)->arch.shadow_mode & SHM_log_dirty)
-#define shadow_mode_translate(_d) ((_d)->arch.shadow_mode & SHM_translate)
-#define shadow_mode_external(_d)  ((_d)->arch.shadow_mode & SHM_external)
-#define shadow_mode_wr_pt_pte(_d) ((_d)->arch.shadow_mode & SHM_wr_pt_pte)
+/* Whether we are translating the domain's frame numbers for it */
+#define shadow_mode_translate(d)  shadow2_mode_translate(d)
 
-#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
-#define __shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
-     (SH_LINEAR_PT_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
-#define shadow_linear_l2_table(_v) ((_v)->arch.shadow_vtable)
+/* ...and  if so, how to add and remove entries in the mapping */
+#define guest_physmap_add_page(_d, _p, _m)                  \
+    shadow2_guest_physmap_add_page((_d), (_p), (_m))
+#define guest_physmap_remove_page(_d, _p, _m)               \
+    shadow2_guest_physmap_remove_page((_d), (_p), (_m))
 
-// easy access to the hl2 table (for translated but not external modes only)
-#define __linear_hl2_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START + \
-     (PERDOMAIN_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
-
-/*
- * For now we use the per-domain BIGLOCK rather than a shadow-specific lock.
- * We usually have the BIGLOCK already acquired anyway, so this is unlikely
- * to cause much unnecessary extra serialisation. Also it's a recursive
- * lock, and there are some code paths containing nested shadow_lock().
- * The #if0'ed code below is therefore broken until such nesting is removed.
- */
-#if 0
-#define shadow_lock_init(_d)                    \
-    spin_lock_init(&(_d)->arch.shadow_lock)
-#define shadow_lock_is_acquired(_d)             \
-    spin_is_locked(&(_d)->arch.shadow_lock)
-#define shadow_lock(_d)                         \
-do {                                            \
-    ASSERT(!shadow_lock_is_acquired(_d));       \
-    spin_lock(&(_d)->arch.shadow_lock);         \
-} while (0)
-#define shadow_unlock(_d)                       \
-do {                                            \
-    ASSERT(!shadow_lock_is_acquired(_d));       \
-    spin_unlock(&(_d)->arch.shadow_lock);       \
-} while (0)
-#else
-#define shadow_lock_init(_d)                    \
-    ((_d)->arch.shadow_nest = 0)
-#define shadow_lock_is_acquired(_d)             \
-    (spin_is_locked(&(_d)->big_lock) && ((_d)->arch.shadow_nest != 0))
-#define shadow_lock(_d)                         \
-do {                                            \
-    LOCK_BIGLOCK(_d);                           \
-    (_d)->arch.shadow_nest++;                   \
-} while (0)
-#define shadow_unlock(_d)                       \
-do {                                            \
-    ASSERT(shadow_lock_is_acquired(_d));        \
-    (_d)->arch.shadow_nest--;                   \
-    UNLOCK_BIGLOCK(_d);                         \
-} while (0)
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 3
-static inline u64 get_cr3_idxval(struct vcpu *v)
-{
-    u64 pae_cr3;
-
-    if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 &&
-            !shadow_mode_log_dirty(v->domain) )
-    {
-        pae_cr3 = hvm_get_guest_ctrl_reg(v, 3); /* get CR3 */
-        return (pae_cr3 >> PAE_CR3_ALIGN) & PAE_CR3_IDX_MASK;
-    }
-    else
-        return 0;
-}
-
-#define shadow_key_t u64
-#define index_to_key(x) ((x) << 32)
-#else
-#define get_cr3_idxval(v) (0)
-#define shadow_key_t unsigned long
-#define index_to_key(x)  (0)
-#endif
-
-
-#define SHADOW_ENCODE_MIN_MAX(_min, _max) ((((GUEST_L1_PAGETABLE_ENTRIES - 1) - (_max)) << 16) | (_min))
-#define SHADOW_MIN(_encoded) ((_encoded) & ((1u<<16) - 1))
-#define SHADOW_MAX(_encoded) ((GUEST_L1_PAGETABLE_ENTRIES - 1) - ((_encoded) >> 16))
-extern void shadow_direct_map_clean(struct domain *d);
-extern int shadow_direct_map_init(struct domain *d);
-extern int shadow_direct_map_fault(
-    unsigned long vpa, struct cpu_user_regs *regs);
-extern void shadow_mode_init(void);
-extern int shadow_mode_control(struct domain *p, dom0_shadow_control_t *sc);
-extern int shadow_fault(unsigned long va, struct cpu_user_regs *regs);
-extern int shadow_mode_enable(struct domain *p, unsigned int mode);
-extern void shadow_invlpg(struct vcpu *, unsigned long);
-extern struct out_of_sync_entry *shadow_mark_mfn_out_of_sync(
-    struct vcpu *v, unsigned long gpfn, unsigned long mfn);
-extern void free_monitor_pagetable(struct vcpu *v);
-extern void __shadow_sync_all(struct domain *d);
-extern int __shadow_out_of_sync(struct vcpu *v, unsigned long va);
-extern int set_p2m_entry(
-    struct domain *d, unsigned long pfn, unsigned long mfn,
-    struct domain_mmap_cache *l2cache,
-    struct domain_mmap_cache *l1cache);
-extern void remove_shadow(struct domain *d, unsigned long gpfn, u32 stype);
-
-extern void free_shadow_page(unsigned long smfn);
-
-extern void shadow_l1_normal_pt_update(struct domain *d,
-                                       paddr_t pa, l1_pgentry_t l1e,
-                                       struct domain_mmap_cache *cache);
-extern void shadow_l2_normal_pt_update(struct domain *d,
-                                       paddr_t pa, l2_pgentry_t l2e,
-                                       struct domain_mmap_cache *cache);
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/page-guest32.h>
-/*
- * va_mask cannot be used because it's used by the shadow hash.
- * Use the score area for for now.
- */
-#define is_xen_l2_slot(t,s)                                                    \
-    ( ((((t) & PGT_score_mask) >> PGT_score_shift) == 3) &&                    \
-      ((s) >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES - 1))) )
-
-extern unsigned long gva_to_gpa(unsigned long gva);
-extern void shadow_l3_normal_pt_update(struct domain *d,
-                                       paddr_t pa, l3_pgentry_t l3e,
-                                       struct domain_mmap_cache *cache);
-#endif
-#if CONFIG_PAGING_LEVELS >= 4
-extern void shadow_l4_normal_pt_update(struct domain *d,
-                                       paddr_t pa, l4_pgentry_t l4e,
-                                       struct domain_mmap_cache *cache);
-#endif
-extern int shadow_do_update_va_mapping(unsigned long va,
-                                       l1_pgentry_t val,
-                                       struct vcpu *v);
-
-
-static inline unsigned long __shadow_status(
-    struct domain *d, unsigned long gpfn, unsigned long stype);
-
-#if CONFIG_PAGING_LEVELS <= 2
-static inline void update_hl2e(struct vcpu *v, unsigned long va);
-#endif
-
-static inline int page_is_page_table(struct page_info *page)
-{
-    struct domain *owner = page_get_owner(page);
-    u32 type_info;
-
-    if ( owner && shadow_mode_refcounts(owner) )
-        return page->count_info & PGC_page_table;
-
-    type_info = page->u.inuse.type_info & PGT_type_mask;
-    return type_info && (type_info <= PGT_l4_page_table);
-}
-
-static inline int mfn_is_page_table(unsigned long mfn)
-{
-    if ( !mfn_valid(mfn) )
-        return 0;
-
-    return page_is_page_table(mfn_to_page(mfn));
-}
-
-static inline int page_out_of_sync(struct page_info *page)
-{
-    return page->count_info & PGC_out_of_sync;
-}
-
-static inline int mfn_out_of_sync(unsigned long mfn)
-{
-    if ( !mfn_valid(mfn) )
-        return 0;
-
-    return page_out_of_sync(mfn_to_page(mfn));
-}
-
-
-/************************************************************************/
-
-static void inline
-__shadow_sync_mfn(struct domain *d, unsigned long mfn)
-{
-    if ( d->arch.out_of_sync )
-    {
-        // XXX - could be smarter
-        //
-        __shadow_sync_all(d);
-    }
-}
-
-static void inline
-__shadow_sync_va(struct vcpu *v, unsigned long va)
-{
-    struct domain *d = v->domain;
-
-    if ( d->arch.out_of_sync && __shadow_out_of_sync(v, va) )
-    {
-        perfc_incrc(shadow_sync_va);
-
-        // XXX - could be smarter
-        //
-        __shadow_sync_all(v->domain);
-    }
-#if CONFIG_PAGING_LEVELS <= 2
-    // Also make sure the HL2 is up-to-date for this address.
-    //
-    if ( unlikely(shadow_mode_translate(v->domain)) )
-        update_hl2e(v, va);
-#endif
-}
-
-static void inline
-shadow_sync_all(struct domain *d)
-{
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        shadow_lock(d);
-
-        if ( d->arch.out_of_sync )
-            __shadow_sync_all(d);
-
-        ASSERT(d->arch.out_of_sync == NULL);
-
-        shadow_unlock(d);
-    }
-}
-
-// SMP BUG: This routine can't ever be used properly in an SMP context.
-//          It should be something like get_shadow_and_sync_va().
-//          This probably shouldn't exist.
-//
-static void inline
-shadow_sync_va(struct vcpu *v, unsigned long gva)
-{
-    struct domain *d = v->domain;
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        shadow_lock(d);
-        __shadow_sync_va(v, gva);
-        shadow_unlock(d);
-    }
-}
-
-extern void __shadow_mode_disable(struct domain *d);
-static inline void shadow_mode_disable(struct domain *d)
-{
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        shadow_lock(d);
-        __shadow_mode_disable(d);
-        shadow_unlock(d);
-    }
-}
-
-/************************************************************************/
-
-#define mfn_to_gmfn(_d, mfn)                         \
-    ( (shadow_mode_translate(_d))                      \
-      ? get_gpfn_from_mfn(mfn)                          \
-      : (mfn) )
-
-#define gmfn_to_mfn(_d, gpfn)                        \
-    ({                                                 \
-        unlikely(shadow_mode_translate(_d))            \
-        ? (likely(current->domain == (_d))             \
-           ? get_mfn_from_gpfn(gpfn)                    \
-           : get_mfn_from_gpfn_foreign(_d, gpfn))       \
-        : (gpfn);                                      \
-    })
-
-extern unsigned long get_mfn_from_gpfn_foreign(
-    struct domain *d, unsigned long gpfn);
-
-/************************************************************************/
-
-struct shadow_status {
-    struct shadow_status *next;   /* Pull-to-front list per hash bucket. */
-    shadow_key_t  gpfn_and_flags; /* Guest pfn plus flags. */
-    unsigned long smfn;           /* Shadow mfn.           */
-};
-
-#define shadow_ht_extra_size 128
-#define shadow_ht_buckets    256
-
-struct out_of_sync_entry {
-    struct out_of_sync_entry *next;
-    struct vcpu   *v;
-    unsigned long gpfn;    /* why is this here? */
-    unsigned long gmfn;
-    unsigned long snapshot_mfn;
-    paddr_t writable_pl1e; /* NB: this is a machine address */
-    unsigned long va;
-};
-
-#define out_of_sync_extra_size 127
-
-#define SHADOW_SNAPSHOT_ELSEWHERE (-1L)
-
-/************************************************************************/
-#define SHADOW_DEBUG 0
-#define SHADOW_VERBOSE_DEBUG 0
-#define SHADOW_VVERBOSE_DEBUG 0
-#define SHADOW_VVVERBOSE_DEBUG 0
-#define SHADOW_HASH_DEBUG 0
-#define FULLSHADOW_DEBUG 0
-
-#if SHADOW_DEBUG
-extern int shadow_status_noswap;
-#define SHADOW_REFLECTS_SNAPSHOT _PAGE_AVAIL0
-#endif
-
-#if SHADOW_VERBOSE_DEBUG
-#define SH_LOG(_f, _a...)                                               \
-    printk("DOM%uP%u: SH_LOG(%d): " _f "\n",                            \
-       current->domain->domain_id , smp_processor_id(), __LINE__ , ## _a )
-#define SH_VLOG(_f, _a...)                                              \
-    printk("DOM%uP%u: SH_VLOG(%d): " _f "\n",                           \
-           current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_LOG(_f, _a...) ((void)0)
-#define SH_VLOG(_f, _a...) ((void)0)
-#endif
-
-#if SHADOW_VVERBOSE_DEBUG
-#define SH_VVLOG(_f, _a...)                                             \
-    printk("DOM%uP%u: SH_VVLOG(%d): " _f "\n",                          \
-           current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_VVLOG(_f, _a...) ((void)0)
-#endif
-
-#if SHADOW_VVVERBOSE_DEBUG
-#define SH_VVVLOG(_f, _a...)                                            \
-    printk("DOM%uP%u: SH_VVVLOG(%d): " _f "\n",                         \
-           current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_VVVLOG(_f, _a...) ((void)0)
-#endif
-
-#if FULLSHADOW_DEBUG
-#define FSH_LOG(_f, _a...)                                              \
-    printk("DOM%uP%u: FSH_LOG(%d): " _f "\n",                           \
-           current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define FSH_LOG(_f, _a...) ((void)0)
-#endif
-
-
-/************************************************************************/
-
-static inline int
-shadow_get_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
-{
-    l1_pgentry_t nl1e;
-    int res;
-    unsigned long mfn;
-    struct domain *owner;
-
-    ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT);
-
-    if ( !shadow_mode_refcounts(d) )
-        return 1;
-
-    nl1e = l1e;
-    l1e_remove_flags(nl1e, _PAGE_GLOBAL);
-
-    if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
-        return 0;
-
-    res = get_page_from_l1e(nl1e, d);
-
-    if ( unlikely(!res) && IS_PRIV(d) && !shadow_mode_translate(d) &&
-         !(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) &&
-         (mfn = l1e_get_pfn(nl1e)) &&
-         mfn_valid(mfn) &&
-         (owner = page_get_owner(mfn_to_page(mfn))) &&
-         (d != owner) )
-    {
-        res = get_page_from_l1e(nl1e, owner);
-        printk("tried to map mfn %lx from domain %d into shadow page tables "
-               "of domain %d; %s\n",
-               mfn, owner->domain_id, d->domain_id,
-               res ? "success" : "failed");
-    }
-
-    if ( unlikely(!res) )
-    {
-        perfc_incrc(shadow_get_page_fail);
-        FSH_LOG("%s failed to get ref l1e=%" PRIpte "\n",
-                __func__, l1e_get_intpte(l1e));
-    }
-
-    return res;
-}
-
-static inline void
-shadow_put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return;
-
-    put_page_from_l1e(l1e, d);
-}
-
-static inline void
-shadow_put_page_type(struct domain *d, struct page_info *page)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return;
-
-    put_page_type(page);
-}
-
-static inline int shadow_get_page(struct domain *d,
-                                  struct page_info *page,
-                                  struct domain *owner)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return 1;
-    return get_page(page, owner);
-}
-
-static inline void shadow_put_page(struct domain *d,
-                                   struct page_info *page)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return;
-    put_page(page);
-}
-
-/************************************************************************/
-
-static inline void __mark_dirty(struct domain *d, unsigned long mfn)
-{
-    unsigned long pfn;
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    if ( likely(!shadow_mode_log_dirty(d)) || !VALID_MFN(mfn) )
-        return;
-
-    ASSERT(d->arch.shadow_dirty_bitmap != NULL);
-
-    /* We /really/ mean PFN here, even for non-translated guests. */
-    pfn = get_gpfn_from_mfn(mfn);
-
-    /*
-     * Values with the MSB set denote MFNs that aren't really part of the 
-     * domain's pseudo-physical memory map (e.g., the shared info frame).
-     * Nothing to do here...
-     */
-    if ( unlikely(IS_INVALID_M2P_ENTRY(pfn)) )
-        return;
-
-    /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
-    if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) &&
-         !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) )
-    {
-        d->arch.shadow_dirty_count++;
-    }
-#ifndef NDEBUG
-    else if ( mfn_valid(mfn) )
-    {
-        SH_VLOG("mark_dirty OOR! mfn=%lx pfn=%lx max=%x (dom %p)",
-               mfn, pfn, d->arch.shadow_dirty_bitmap_size, d);
-        SH_VLOG("dom=%p caf=%08x taf=%" PRtype_info, 
-                page_get_owner(mfn_to_page(mfn)),
-                mfn_to_page(mfn)->count_info, 
-                mfn_to_page(mfn)->u.inuse.type_info );
-    }
-#endif
-}
-
-
-static inline void mark_dirty(struct domain *d, unsigned int mfn)
-{
-    if ( unlikely(shadow_mode_log_dirty(d)) )
-    {
-        shadow_lock(d);
-        __mark_dirty(d, mfn);
-        shadow_unlock(d);
-    }
-}
-
-
-/************************************************************************/
-#if CONFIG_PAGING_LEVELS <= 2
-static inline void
-__shadow_get_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e)
-{
-    ASSERT(shadow_mode_enabled(v->domain));
-
-    *psl2e = v->arch.shadow_vtable[l2_table_offset(va)];
-}
-
-static inline void
-__shadow_set_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
-    ASSERT(shadow_mode_enabled(v->domain));
-
-    v->arch.shadow_vtable[l2_table_offset(va)] = value;
-}
-
-static inline void
-__guest_get_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t *pl2e)
-{
-    *pl2e = v->arch.guest_vtable[l2_table_offset(va)];
-}
-
-static inline void
-__guest_set_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
-    struct domain *d = v->domain;
-
-    v->arch.guest_vtable[l2_table_offset(va)] = value;
-
-    if ( unlikely(shadow_mode_translate(d)) )
-        update_hl2e(v, va);
-
-    __mark_dirty(d, pagetable_get_pfn(v->arch.guest_table));
-}
-
-static inline void
-__direct_get_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e)
-{
-    l2_pgentry_t *phys_vtable;
-
-    ASSERT(shadow_mode_enabled(v->domain));
-
-    phys_vtable = map_domain_page(
-        pagetable_get_pfn(v->domain->arch.phys_table));
-
-    *psl2e = phys_vtable[l2_table_offset(va)];
-
-    unmap_domain_page(phys_vtable);
-}
-
-static inline void
-__direct_set_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
-    l2_pgentry_t *phys_vtable;
-
-    ASSERT(shadow_mode_enabled(v->domain));
-
-    phys_vtable = map_domain_page(
-        pagetable_get_pfn(v->domain->arch.phys_table));
-
-    phys_vtable[l2_table_offset(va)] = value;
-
-    unmap_domain_page(phys_vtable);
-}
-
-static inline void
-update_hl2e(struct vcpu *v, unsigned long va)
-{
-    int index = l2_table_offset(va);
-    unsigned long mfn;
-    l2_pgentry_t gl2e = v->arch.guest_vtable[index];
-    l1_pgentry_t old_hl2e, new_hl2e;
-    int need_flush = 0;
-
-    ASSERT(shadow_mode_translate(v->domain));
-
-    old_hl2e = v->arch.hl2_vtable[index];
-
-    if ( (l2e_get_flags(gl2e) & _PAGE_PRESENT) &&
-         VALID_MFN(mfn = get_mfn_from_gpfn(l2e_get_pfn(gl2e))) )
-        new_hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
-    else
-        new_hl2e = l1e_empty();
-
-    // only do the ref counting if something has changed.
-    //
-    if ( (l1e_has_changed(old_hl2e, new_hl2e, PAGE_FLAG_MASK)) )
-    {
-        if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) &&
-             !shadow_get_page(v->domain, mfn_to_page(l1e_get_pfn(new_hl2e)),
-                              v->domain) )
-            new_hl2e = l1e_empty();
-        if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT )
-        {
-            shadow_put_page(v->domain, mfn_to_page(l1e_get_pfn(old_hl2e)));
-            need_flush = 1;
-        }
-
-        v->arch.hl2_vtable[l2_table_offset(va)] = new_hl2e;
-
-        if ( need_flush )
-        {
-            perfc_incrc(update_hl2e_invlpg);
-            flush_tlb_one_mask(v->domain->domain_dirty_cpumask,
-                               &linear_pg_table[l1_linear_offset(va)]);
-        }
-    }
-}
-
-static inline void shadow_drop_references(
-    struct domain *d, struct page_info *page)
-{
-    if ( likely(!shadow_mode_refcounts(d)) ||
-         ((page->u.inuse.type_info & PGT_count_mask) == 0) )
-        return;
-
-    /* XXX This needs more thought... */
-    printk("%s: needing to call shadow_remove_all_access for mfn=%lx\n",
-           __func__, page_to_mfn(page));
-    printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
-           page->count_info, page->u.inuse.type_info);
-
-    shadow_lock(d);
-    shadow_remove_all_access(d, page_to_mfn(page));
-    shadow_unlock(d);
-
-    printk("After:  mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
-           page->count_info, page->u.inuse.type_info);
-}
-
-/* XXX Needs more thought. Neither pretty nor fast: a place holder. */
-static inline void shadow_sync_and_drop_references(
-    struct domain *d, struct page_info *page)
-{
-    if ( likely(!shadow_mode_refcounts(d)) )
-        return;
-
-    if ( page_out_of_sync(page) )
-        __shadow_sync_mfn(d, page_to_mfn(page));
-
-    shadow_remove_all_access(d, page_to_mfn(page));
-}
-#endif
-
-/************************************************************************/
-
-/*
- * Add another shadow reference to smfn.
- */
-static inline int
-get_shadow_ref(unsigned long smfn)
-{
-    u32 x, nx;
-
-    ASSERT(mfn_valid(smfn));
-
-    x = mfn_to_page(smfn)->count_info;
-    nx = x + 1;
-
-    if ( unlikely(nx == 0) )
-    {
-        printk("get_shadow_ref overflow, gmfn=%" PRtype_info  " smfn=%lx\n",
-               mfn_to_page(smfn)->u.inuse.type_info & PGT_mfn_mask,
-               smfn);
-        BUG();
-    }
-    
-    // Guarded by the shadow lock...
-    //
-    mfn_to_page(smfn)->count_info = nx;
-
-    return 1;
-}
-
-/*
- * Drop a shadow reference to smfn.
- */
-static inline void
-put_shadow_ref(unsigned long smfn)
-{
-    u32 x, nx;
-
-    ASSERT(mfn_valid(smfn));
-
-    x = mfn_to_page(smfn)->count_info;
-    nx = x - 1;
-
-    if ( unlikely(x == 0) )
-    {
-        printk("put_shadow_ref underflow, smfn=%lx oc=%08x t=%" 
-               PRtype_info "\n",
-               smfn,
-               mfn_to_page(smfn)->count_info,
-               mfn_to_page(smfn)->u.inuse.type_info);
-        BUG();
-    }
-
-    // Guarded by the shadow lock...
-    //
-    mfn_to_page(smfn)->count_info = nx;
-
-    if ( unlikely(nx == 0) )
-    {
-        free_shadow_page(smfn);
-    }
-}
-
-static inline void
-shadow_pin(unsigned long smfn)
-{
-    ASSERT( !(mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) );
-
-    mfn_to_page(smfn)->u.inuse.type_info |= PGT_pinned;
-    if ( unlikely(!get_shadow_ref(smfn)) )
-        BUG();
-}
-
-static inline void
-shadow_unpin(unsigned long smfn)
-{
-    ASSERT( (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) );
-
-    mfn_to_page(smfn)->u.inuse.type_info &= ~PGT_pinned;
-    put_shadow_ref(smfn);
-}
-
-/*
- * SMP issue. The following code assumes the shadow lock is held. Re-visit
- * when working on finer-gained locks for shadow.
- */
-static inline void set_guest_back_ptr(
-    struct domain *d, l1_pgentry_t spte,
-    unsigned long smfn, unsigned int index)
-{
-    struct page_info *gpage;
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    if ( !shadow_mode_external(d) || 
-         ((l1e_get_flags(spte) & (_PAGE_PRESENT|_PAGE_RW)) !=
-          (_PAGE_PRESENT|_PAGE_RW)) )
-        return;
-
-    gpage = l1e_get_page(spte);
-
-    ASSERT(smfn != 0);
-    ASSERT(page_to_mfn(gpage) != 0);
-
-    gpage->tlbflush_timestamp = smfn;
-    gpage->u.inuse.type_info &= ~PGT_va_mask;
-    gpage->u.inuse.type_info |= (unsigned long)index << PGT_va_shift;
-}
-
-/************************************************************************/
-#if CONFIG_PAGING_LEVELS <= 2
-extern void shadow_mark_va_out_of_sync(
-    struct vcpu *v, unsigned long gpfn, unsigned long mfn,
-    unsigned long va);
-
-static inline int l1pte_write_fault(
-    struct vcpu *v, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p,
-    unsigned long va)
-{
-    struct domain *d = v->domain;
-    l1_pgentry_t gpte = *gpte_p;
-    l1_pgentry_t spte;
-    unsigned long gpfn = l1e_get_pfn(gpte);
-    unsigned long gmfn = gmfn_to_mfn(d, gpfn);
-
-    //printk("l1pte_write_fault gmfn=%lx\n", gmfn);
-
-    if ( unlikely(!VALID_MFN(gmfn)) )
-    {
-        SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn);
-        *spte_p = l1e_empty();
-        return 0;
-    }
-
-    ASSERT(l1e_get_flags(gpte) & _PAGE_RW);
-    l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED);
-    spte = l1e_from_pfn(gmfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
-    SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
-             l1e_get_intpte(spte), l1e_get_intpte(gpte));
-
-    __mark_dirty(d, gmfn);
-
-    if ( mfn_is_page_table(gmfn) )
-        shadow_mark_va_out_of_sync(v, gpfn, gmfn, va);
-
-    *gpte_p = gpte;
-    *spte_p = spte;
-
-    return 1;
-}
-
-static inline int l1pte_read_fault(
-    struct domain *d, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p)
-{ 
-    l1_pgentry_t gpte = *gpte_p;
-    l1_pgentry_t spte = *spte_p;
-    unsigned long pfn = l1e_get_pfn(gpte);
-    unsigned long mfn = gmfn_to_mfn(d, pfn);
-
-    if ( unlikely(!VALID_MFN(mfn)) )
-    {
-        SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn);
-        *spte_p = l1e_empty();
-        return 0;
-    }
-
-    l1e_add_flags(gpte, _PAGE_ACCESSED);
-    spte = l1e_from_pfn(mfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
-    if ( shadow_mode_log_dirty(d) || !(l1e_get_flags(gpte) & _PAGE_DIRTY) ||
-         mfn_is_page_table(mfn) )
-    {
-        l1e_remove_flags(spte, _PAGE_RW);
-    }
-
-    SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
-             l1e_get_intpte(spte), l1e_get_intpte(gpte));
-    *gpte_p = gpte;
-    *spte_p = spte;
-
-    return 1;
-}
-#endif
-
-static inline void l1pte_propagate_from_guest(
-    struct domain *d, guest_l1_pgentry_t gpte, l1_pgentry_t *spte_p)
-{ 
-    unsigned long mfn;
-    l1_pgentry_t spte;
-
-    spte = l1e_empty();
-
-    if ( ((guest_l1e_get_flags(gpte) & (_PAGE_PRESENT|_PAGE_ACCESSED) ) ==
-          (_PAGE_PRESENT|_PAGE_ACCESSED)) &&
-         VALID_MFN(mfn = gmfn_to_mfn(d, l1e_get_pfn(gpte))) )
-    {
-        spte = l1e_from_pfn(
-            mfn, guest_l1e_get_flags(gpte) & ~(_PAGE_GLOBAL | _PAGE_AVAIL));
-
-        if ( shadow_mode_log_dirty(d) ||
-             !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) ||
-             mfn_is_page_table(mfn) )
-        {
-            l1e_remove_flags(spte, _PAGE_RW);
-        }
-    }
-
-    if ( l1e_get_intpte(spte) || l1e_get_intpte(gpte) )
-        SH_VVVLOG("%s: gpte=%" PRIpte ", new spte=%" PRIpte,
-                  __func__, l1e_get_intpte(gpte), l1e_get_intpte(spte));
-
-    *spte_p = spte;
-}
-
-static inline void hl2e_propagate_from_guest(
-    struct domain *d, l2_pgentry_t gpde, l1_pgentry_t *hl2e_p)
-{
-    unsigned long pfn = l2e_get_pfn(gpde);
-    unsigned long mfn;
-    l1_pgentry_t hl2e;
-    
-    hl2e = l1e_empty();
-
-    if ( l2e_get_flags(gpde) & _PAGE_PRESENT )
-    {
-        mfn = gmfn_to_mfn(d, pfn);
-        if ( VALID_MFN(mfn) && mfn_valid(mfn) )
-            hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
-    }
-
-    if ( l1e_get_intpte(hl2e) || l2e_get_intpte(gpde) )
-        SH_VVLOG("%s: gpde=%" PRIpte " hl2e=%" PRIpte, __func__,
-                 l2e_get_intpte(gpde), l1e_get_intpte(hl2e));
-
-    *hl2e_p = hl2e;
-}
-
-static inline void l2pde_general(
-    struct domain *d,
-    guest_l2_pgentry_t *gpde_p,
-    l2_pgentry_t *spde_p,
-    unsigned long sl1mfn)
-{
-    guest_l2_pgentry_t gpde = *gpde_p;
-    l2_pgentry_t spde;
-
-    spde = l2e_empty();
-    if ( (guest_l2e_get_flags(gpde) & _PAGE_PRESENT) && (sl1mfn != 0) )
-    {
-        spde = l2e_from_pfn(
-            sl1mfn,
-            (guest_l2e_get_flags(gpde) | _PAGE_RW | _PAGE_ACCESSED) & ~_PAGE_AVAIL);
-
-        /* N.B. PDEs do not have a dirty bit. */
-        guest_l2e_add_flags(gpde, _PAGE_ACCESSED);
-
-        *gpde_p = gpde;
-    } 
-
-    if ( l2e_get_intpte(spde) || l2e_get_intpte(gpde) )
-        SH_VVLOG("%s: gpde=%" PRIpte ", new spde=%" PRIpte, __func__,
-                 l2e_get_intpte(gpde), l2e_get_intpte(spde));
-
-    *spde_p = spde;
-}
-
-static inline void l2pde_propagate_from_guest(
-    struct domain *d, guest_l2_pgentry_t *gpde_p, l2_pgentry_t *spde_p)
-{
-    guest_l2_pgentry_t gpde = *gpde_p;
-    unsigned long sl1mfn = 0;
-
-    if ( guest_l2e_get_flags(gpde) & _PAGE_PRESENT )
-        sl1mfn =  __shadow_status(d, l2e_get_pfn(gpde), PGT_l1_shadow);
-    l2pde_general(d, gpde_p, spde_p, sl1mfn);
-}
-    
-/************************************************************************/
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_pte_change(
-    struct domain *d,
-    guest_l1_pgentry_t new_pte,
-    l1_pgentry_t *shadow_pte_p)
-{
-    l1_pgentry_t old_spte, new_spte;
-    int need_flush = 0;
-
-    perfc_incrc(validate_pte_calls);
-
-    l1pte_propagate_from_guest(d, new_pte, &new_spte);
-
-    if ( shadow_mode_refcounts(d) )
-    {
-        old_spte = *shadow_pte_p;
-
-        if ( l1e_get_intpte(old_spte) == l1e_get_intpte(new_spte) )
-        {
-            // No accounting required...
-            //
-            perfc_incrc(validate_pte_changes1);
-        }
-        else if ( l1e_get_intpte(old_spte) == (l1e_get_intpte(new_spte)|_PAGE_RW) )
-        {
-            // Fast path for PTEs that have merely been write-protected
-            // (e.g., during a Unix fork()). A strict reduction in privilege.
-            //
-            perfc_incrc(validate_pte_changes2);
-            if ( likely(l1e_get_flags(new_spte) & _PAGE_PRESENT) )
-                shadow_put_page_type(d, mfn_to_page(l1e_get_pfn(new_spte)));
-        }
-        else if ( ((l1e_get_flags(old_spte) | l1e_get_flags(new_spte)) &
-                   _PAGE_PRESENT ) &&
-                  l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
-        {
-            // only do the ref counting if something important changed.
-            //
-            perfc_incrc(validate_pte_changes3);
-
-            if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
-            {
-                shadow_put_page_from_l1e(old_spte, d);
-                need_flush = 1;
-            }
-            if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
-                 !shadow_get_page_from_l1e(new_spte, d) ) {
-                new_spte = l1e_empty();
-                need_flush = -1; /* need to unshadow the page */
-            }
-        }
-        else
-        {
-            perfc_incrc(validate_pte_changes4);
-        }
-    }
-
-    *shadow_pte_p = new_spte;
-
-    return need_flush;
-}
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_hl2e_change(
-    struct domain *d,
-    l2_pgentry_t new_gpde,
-    l1_pgentry_t *shadow_hl2e_p)
-{
-    l1_pgentry_t old_hl2e, new_hl2e;
-    int need_flush = 0;
-
-    perfc_incrc(validate_hl2e_calls);
-
-    old_hl2e = *shadow_hl2e_p;
-    hl2e_propagate_from_guest(d, new_gpde, &new_hl2e);
-
-    // Only do the ref counting if something important changed.
-    //
-    if ( ((l1e_get_flags(old_hl2e) | l1e_get_flags(new_hl2e)) & _PAGE_PRESENT) &&
-         l1e_has_changed(old_hl2e, new_hl2e, _PAGE_PRESENT) )
-    {
-        perfc_incrc(validate_hl2e_changes);
-
-        if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) &&
-             !get_page(mfn_to_page(l1e_get_pfn(new_hl2e)), d) )
-            new_hl2e = l1e_empty();
-        if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT )
-        {
-            put_page(mfn_to_page(l1e_get_pfn(old_hl2e)));
-            need_flush = 1;
-        }
-    }
-
-    *shadow_hl2e_p = new_hl2e;
-
-    return need_flush;
-}
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_pde_change(
-    struct domain *d,
-    guest_l2_pgentry_t new_gpde,
-    l2_pgentry_t *shadow_pde_p)
-{
-    l2_pgentry_t old_spde, new_spde;
-    int need_flush = 0;
-
-    perfc_incrc(validate_pde_calls);
-
-    old_spde = *shadow_pde_p;
-    l2pde_propagate_from_guest(d, &new_gpde, &new_spde);
-
-    // Only do the ref counting if something important changed.
-    //
-    if ( ((l2e_get_intpte(old_spde) | l2e_get_intpte(new_spde)) & _PAGE_PRESENT) &&
-         l2e_has_changed(old_spde, new_spde, _PAGE_PRESENT) )
-    {
-        perfc_incrc(validate_pde_changes);
-
-        if ( (l2e_get_flags(new_spde) & _PAGE_PRESENT) &&
-             !get_shadow_ref(l2e_get_pfn(new_spde)) )
-            BUG();
-        if ( l2e_get_flags(old_spde) & _PAGE_PRESENT )
-        {
-            put_shadow_ref(l2e_get_pfn(old_spde));
-            need_flush = 1;
-        }
-    }
-
-    *shadow_pde_p = new_spde;
-
-    return need_flush;
-}
-
-/*********************************************************************/
-
-#if SHADOW_HASH_DEBUG
-
-static void shadow_audit(struct domain *d, int print)
-{
-    int live = 0, free = 0, j = 0, abs;
-    struct shadow_status *a;
-
-    for ( j = 0; j < shadow_ht_buckets; j++ )
-    {
-        a = &d->arch.shadow_ht[j];        
-        if ( a->gpfn_and_flags )
-        {
-            live++;
-            ASSERT(a->smfn);
-        }
-        else
-            ASSERT(!a->next);
-
-        a = a->next;
-        while ( a && (live < 9999) )
-        { 
-            live++;
-            if ( (a->gpfn_and_flags == 0) || (a->smfn == 0) )
-            {
-                printk("XXX live=%d gpfn+flags=%lx sp=%lx next=%p\n",
-                       live, a->gpfn_and_flags, a->smfn, a->next);
-                BUG();
-            }
-            ASSERT(a->smfn);
-            a = a->next;
-        }
-        ASSERT(live < 9999);
-    }
-
-    for ( a = d->arch.shadow_ht_free; a != NULL; a = a->next )
-        free++;
-
-    if ( print )
-        printk("Xlive=%d free=%d\n", live, free);
-
-    // BUG: this only works if there's only a single domain which is
-    //      using shadow tables.
-    //
-    abs = (
-        perfc_value(shadow_l1_pages) +
-        perfc_value(shadow_l2_pages) +
-        perfc_value(hl2_table_pages) +
-        perfc_value(snapshot_pages) +
-        perfc_value(writable_pte_predictions)
-        ) - live;
-#ifdef PERF_COUNTERS
-    if ( (abs < -1) || (abs > 1) )
-    {
-        printk("live=%d free=%d l1=%d l2=%d hl2=%d snapshot=%d writable_ptes=%d\n",
-               live, free,
-               perfc_value(shadow_l1_pages),
-               perfc_value(shadow_l2_pages),
-               perfc_value(hl2_table_pages),
-               perfc_value(snapshot_pages),
-               perfc_value(writable_pte_predictions));
-        BUG();
-    }
-#endif
-
-    // XXX ought to add some code to audit the out-of-sync entries, too.
-    //
-}
-#else
-#define shadow_audit(p, print) ((void)0)
-#endif
-
-
-static inline struct shadow_status *hash_bucket(
-    struct domain *d, unsigned int gpfn)
-{
-    return &d->arch.shadow_ht[gpfn % shadow_ht_buckets];
-}
-
-
-/*
- * N.B. This takes a guest pfn (i.e. a pfn in the guest's namespace,
- *      which, depending on full shadow mode, may or may not equal
- *      its mfn).
- *      It returns the shadow's mfn, or zero if it doesn't exist.
- */
-static inline unsigned long __shadow_status(
-    struct domain *d, unsigned long gpfn, unsigned long stype)
-{
-    struct shadow_status *p, *x, *head;
-    shadow_key_t key;
-#if CONFIG_PAGING_LEVELS >= 3
-    if ( d->arch.ops->guest_paging_levels == PAGING_L3 && stype == PGT_l4_shadow )
-        key = gpfn | stype | index_to_key(get_cr3_idxval(current));
-    else
-#endif
-        key = gpfn | stype;
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(gpfn == (gpfn & PGT_mfn_mask));
-    ASSERT(stype && !(stype & ~PGT_type_mask));
-
-    perfc_incrc(shadow_status_calls);
-
-    x = head = hash_bucket(d, gpfn);
-    p = NULL;
-
-    shadow_audit(d, 0);
-
-    do
-    {
-        ASSERT(x->gpfn_and_flags || ((x == head) && (x->next == NULL)));
-
-        if ( x->gpfn_and_flags == key )
-        {
-#if SHADOW_DEBUG
-            if ( unlikely(shadow_status_noswap) )
-                return x->smfn;
-#endif
-            /* Pull-to-front if 'x' isn't already the head item. */
-            if ( unlikely(x != head) )
-            {
-                /* Delete 'x' from list and reinsert immediately after head. */
-                p->next = x->next;
-                x->next = head->next;
-                head->next = x;
-
-                /* Swap 'x' contents with head contents. */
-                SWAP(head->gpfn_and_flags, x->gpfn_and_flags);
-                SWAP(head->smfn, x->smfn);
-            }
-            else
-            {
-                perfc_incrc(shadow_status_hit_head);
-            }
-
-            return head->smfn;
-        }
-
-        p = x;
-        x = x->next;
-    }
-    while ( x != NULL );
-
-    perfc_incrc(shadow_status_miss);
-    return 0;
-}
-
-/*
- * Not clear if pull-to-front is worth while for this or not,
- * as it generally needs to scan the entire bucket anyway.
- * Much simpler without.
- *
- * Either returns PGT_none, or PGT_l{1,2,3,4}_page_table.
- */
-static inline u32
-shadow_max_pgtable_type(struct domain *d, unsigned long gpfn,
-                        unsigned long *smfn)
-{
-    struct shadow_status *x;
-    u32 pttype = PGT_none, type;
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(gpfn == (gpfn & PGT_mfn_mask));
-
-    perfc_incrc(shadow_max_type);
-
-    x = hash_bucket(d, gpfn);
-
-    while ( x && x->gpfn_and_flags )
-    {
-        if ( (x->gpfn_and_flags & PGT_mfn_mask) == gpfn )
-        {
-            type = x->gpfn_and_flags & PGT_type_mask;
-
-            switch ( type )
-            {
-            case PGT_hl2_shadow:
-                // Treat an HL2 as if it's an L1
-                //
-                type = PGT_l1_shadow;
-                break;
-            case PGT_snapshot:
-            case PGT_writable_pred:
-                // Ignore snapshots -- they don't in and of themselves constitute
-                // treating a page as a page table
-                //
-                goto next;
-            case PGT_base_page_table:
-                // Early exit if we found the max possible value
-                //
-                return type;
-            default:
-                break;
-            }
-
-            if ( type > pttype )
-            {
-                pttype = type;
-                if ( smfn )
-                    *smfn = x->smfn;
-            }
-        }
-    next:
-        x = x->next;
-    }
-
-    return pttype;
-}
-
-static inline void delete_shadow_status(
-    struct domain *d, unsigned long gpfn, unsigned long gmfn, unsigned int stype, u64 index)
-{
-    struct shadow_status *p, *x, *n, *head;
-
-    shadow_key_t key = gpfn | stype | index_to_key(index);
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(!(gpfn & ~PGT_mfn_mask));
-    ASSERT(stype && !(stype & ~PGT_type_mask));
-
-    head = hash_bucket(d, gpfn);
-
-    SH_VLOG("delete gpfn=%lx t=%08x bucket=%p", gpfn, stype, head);
-    shadow_audit(d, 0);
-
-    /* Match on head item? */
-    if ( head->gpfn_and_flags == key )
-    {
-        if ( (n = head->next) != NULL )
-        {
-            /* Overwrite head with contents of following node. */
-            head->gpfn_and_flags = n->gpfn_and_flags;
-            head->smfn           = n->smfn;
-
-            /* Delete following node. */
-            head->next           = n->next;
-
-            /* Add deleted node to the free list. */
-            n->gpfn_and_flags = 0;
-            n->smfn           = 0;
-            n->next           = d->arch.shadow_ht_free;
-            d->arch.shadow_ht_free = n;
-        }
-        else
-        {
-            /* This bucket is now empty. Initialise the head node. */
-            head->gpfn_and_flags = 0;
-            head->smfn           = 0;
-        }
-
-        goto found;
-    }
-
-    p = head;
-    x = head->next;
-
-    do
-    {
-        if ( x->gpfn_and_flags == key )
-        {
-            /* Delete matching node. */
-            p->next = x->next;
-
-            /* Add deleted node to the free list. */
-            x->gpfn_and_flags = 0;
-            x->smfn           = 0;
-            x->next           = d->arch.shadow_ht_free;
-            d->arch.shadow_ht_free = x;
-
-            goto found;
-        }
-
-        p = x;
-        x = x->next;
-    }
-    while ( x != NULL );
-
-    /* If we got here, it wasn't in the list! */
-    BUG();
-
- found:
-    // release ref to page
-    if ( stype != PGT_writable_pred )
-        put_page(mfn_to_page(gmfn));
-
-    shadow_audit(d, 0);
-}
-
-static inline void set_shadow_status(
-    struct domain *d, unsigned long gpfn, unsigned long gmfn,
-    unsigned long smfn, unsigned long stype, u64 index)
-{
-    struct shadow_status *x, *head, *extra;
-    int i;
-
-    shadow_key_t key = gpfn | stype | index_to_key(index);
-
-    SH_VVLOG("set gpfn=%lx gmfn=%lx smfn=%lx t=%lx", gpfn, gmfn, smfn, stype);
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    ASSERT(shadow_mode_translate(d) || gpfn);
-    ASSERT(!(gpfn & ~PGT_mfn_mask));
-
-    // XXX - need to be more graceful.
-    ASSERT(VALID_MFN(gmfn));
-
-    ASSERT(stype && !(stype & ~PGT_type_mask));
-
-    x = head = hash_bucket(d, gpfn);
-
-    SH_VLOG("set gpfn=%lx smfn=%lx t=%lx bucket=%p(%p)",
-             gpfn, smfn, stype, x, x->next);
-    shadow_audit(d, 0);
-
-    // grab a reference to the guest page to represent the entry in the shadow
-    // hash table
-    //
-    // XXX - Should PGT_writable_pred grab a page ref?
-    //     - Who/how are these hash table entry refs flushed if/when a page
-    //       is given away by the domain?
-    //
-    if ( stype != PGT_writable_pred )
-        get_page(mfn_to_page(gmfn), d);
-
-    /*
-     * STEP 1. If page is already in the table, update it in place.
-     */
-    do
-    {
-        if ( unlikely(x->gpfn_and_flags == key) )
-        {
-            if ( stype != PGT_writable_pred )
-                BUG(); // we should never replace entries into the hash table
-            x->smfn = smfn;
-            if ( stype != PGT_writable_pred )
-                put_page(mfn_to_page(gmfn)); // already had a ref...
-            goto done;
-        }
-
-        x = x->next;
-    }
-    while ( x != NULL );
-
-    /*
-     * STEP 2. The page must be inserted into the table.
-     */
-
-    /* If the bucket is empty then insert the new page as the head item. */
-    if ( head->gpfn_and_flags == 0 )
-    {
-        head->gpfn_and_flags = key;
-        head->smfn           = smfn;
-        ASSERT(head->next == NULL);
-        goto done;
-    }
-
-    /* We need to allocate a new node. Ensure the quicklist is non-empty. */
-    if ( unlikely(d->arch.shadow_ht_free == NULL) )
-    {
-        SH_VLOG("Allocate more shadow hashtable blocks.");
-
-        extra = xmalloc_bytes(
-            sizeof(void *) + (shadow_ht_extra_size * sizeof(*x)));
-
-        /* XXX Should be more graceful here. */
-        if ( extra == NULL )
-            BUG();
-
-        memset(extra, 0, sizeof(void *) + (shadow_ht_extra_size * sizeof(*x)));
-
-        /* Record the allocation block so it can be correctly freed later. */
-        d->arch.shadow_extras_count++;
-        *((struct shadow_status **)&extra[shadow_ht_extra_size]) = 
-            d->arch.shadow_ht_extras;
-        d->arch.shadow_ht_extras = &extra[0];
-
-        /* Thread a free chain through the newly-allocated nodes. */
-        for ( i = 0; i < (shadow_ht_extra_size - 1); i++ )
-            extra[i].next = &extra[i+1];
-        extra[i].next = NULL;
-
-        /* Add the new nodes to the free list. */
-        d->arch.shadow_ht_free = &extra[0];
-    }
-
-    /* Allocate a new node from the quicklist. */
-    x                      = d->arch.shadow_ht_free;
-    d->arch.shadow_ht_free = x->next;
-
-    /* Initialise the new node and insert directly after the head item. */
-    x->gpfn_and_flags = key;
-    x->smfn           = smfn;
-    x->next           = head->next;
-    head->next        = x;
-
- done:
-    shadow_audit(d, 0);
-
-    if ( stype <= PGT_l4_shadow )
-    {
-        // add to front of list of pages to check when removing write
-        // permissions for a page...
-        //
-    }
-}
-
-/************************************************************************/
-
-static inline void guest_physmap_add_page(
-    struct domain *d, unsigned long gpfn, unsigned long mfn)
-{
-    struct domain_mmap_cache c1, c2;
-
-    if ( likely(!shadow_mode_translate(d)) )
-        return;
-
-    domain_mmap_cache_init(&c1);
-    domain_mmap_cache_init(&c2);
-    shadow_lock(d);
-    shadow_sync_and_drop_references(d, mfn_to_page(mfn));
-    set_p2m_entry(d, gpfn, mfn, &c1, &c2);
-    set_gpfn_from_mfn(mfn, gpfn);
-    shadow_unlock(d);
-    domain_mmap_cache_destroy(&c1);
-    domain_mmap_cache_destroy(&c2);
-}
-
-static inline void guest_physmap_remove_page(
-    struct domain *d, unsigned long gpfn, unsigned long mfn)
-{
-    struct domain_mmap_cache c1, c2;
-    unsigned long type;
-
-    if ( likely(!shadow_mode_translate(d)) )
-        return;
-
-    domain_mmap_cache_init(&c1);
-    domain_mmap_cache_init(&c2);
-    shadow_lock(d);
-    shadow_sync_and_drop_references(d, mfn_to_page(mfn));
-    while ( (type = shadow_max_pgtable_type(d, gpfn, NULL)) != PGT_none )
-        free_shadow_page(__shadow_status(d, gpfn, type));
-    set_p2m_entry(d, gpfn, -1, &c1, &c2);
-    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
-    shadow_unlock(d);
-    domain_mmap_cache_destroy(&c1);
-    domain_mmap_cache_destroy(&c2);
-}
-
-/************************************************************************/
-
-void static inline
-shadow_update_min_max(unsigned long smfn, int index)
-{
-    struct page_info *sl1page = mfn_to_page(smfn);
-    u32 min_max = sl1page->tlbflush_timestamp;
-    int min = SHADOW_MIN(min_max);
-    int max = SHADOW_MAX(min_max);
-    int update = 0;
-
-    if ( index < min )
-    {
-        min = index;
-        update = 1;
-    }
-    if ( index > max )
-    {
-        max = index;
-        update = 1;
-    }
-    if ( update )
-        sl1page->tlbflush_timestamp = SHADOW_ENCODE_MIN_MAX(min, max);
-}
-
-#if CONFIG_PAGING_LEVELS <= 2
-extern void shadow_map_l1_into_current_l2(unsigned long va);
-
-void static inline
-shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
-{
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    l2_pgentry_t sl2e = {0};
-
-    __shadow_get_l2e(v, va, &sl2e);
-    if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
-    {
-        /*
-         * Either the L1 is not shadowed, or the shadow isn't linked into
-         * the current shadow L2.
-         */
-        if ( create_l1_shadow )
-        {
-            perfc_incrc(shadow_set_l1e_force_map);
-            shadow_map_l1_into_current_l2(va);
-        }
-        else /* check to see if it exists; if so, link it in */
-        {
-            l2_pgentry_t gpde = linear_l2_table(v)[l2_table_offset(va)];
-            unsigned long gl1pfn = l2e_get_pfn(gpde);
-            unsigned long sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
-
-            ASSERT( l2e_get_flags(gpde) & _PAGE_PRESENT );
-
-            if ( sl1mfn )
-            {
-                perfc_incrc(shadow_set_l1e_unlinked);
-                if ( !get_shadow_ref(sl1mfn) )
-                    BUG();
-                l2pde_general(d, &gpde, &sl2e, sl1mfn);
-                __guest_set_l2e(v, va, gpde);
-                __shadow_set_l2e(v, va, sl2e);
-            }
-            else
-            {
-                // no shadow exists, so there's nothing to do.
-                perfc_incrc(shadow_set_l1e_fail);
-                return;
-            }
-        }
-    }
-
-    __shadow_get_l2e(v, va, &sl2e);
-
-    if ( shadow_mode_refcounts(d) )
-    {
-        l1_pgentry_t old_spte = shadow_linear_pg_table[l1_linear_offset(va)];
-
-        // only do the ref counting if something important changed.
-        //
-        if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
-        {
-            if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
-                 !shadow_get_page_from_l1e(new_spte, d) )
-                new_spte = l1e_empty();
-            if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
-                shadow_put_page_from_l1e(old_spte, d);
-        }
-
-    }
-
-    set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va));
-    shadow_linear_pg_table[l1_linear_offset(va)] = new_spte;
-    shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va));
-}
-#endif
-/************************************************************************/
-
-static inline int
-shadow_mode_page_writable(unsigned long va, struct cpu_user_regs *regs, unsigned long gpfn)
-{
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    unsigned long mfn = gmfn_to_mfn(d, gpfn);
-    u32 type = mfn_to_page(mfn)->u.inuse.type_info & PGT_type_mask;
-
-    if ( shadow_mode_refcounts(d) &&
-         (type == PGT_writable_page) )
-        type = shadow_max_pgtable_type(d, gpfn, NULL);
-
-    // Strange but true: writable page tables allow kernel-mode access
-    // to L1 page table pages via write-protected PTEs...  Similarly, write 
-    // access to all page table pages is granted for shadow_mode_write_all
-    // clients.
-    //
-    if ( ((shadow_mode_write_l1(d) && (type == PGT_l1_page_table)) ||
-          (shadow_mode_write_all(d) && type && (type <= PGT_l4_page_table))) &&
-         ((va < HYPERVISOR_VIRT_START)
-#if defined(__x86_64__)
-          || (va >= HYPERVISOR_VIRT_END)
-#endif
-             ) &&
-         guest_kernel_mode(v, regs) )
-        return 1;
-
-    return 0;
-}
-
-#if CONFIG_PAGING_LEVELS <= 2
-static inline l1_pgentry_t gva_to_gpte(unsigned long gva)
-{
-    l2_pgentry_t gpde;
-    l1_pgentry_t gpte;
-    struct vcpu *v = current;
-
-    ASSERT( shadow_mode_translate(current->domain) );
-
-    __guest_get_l2e(v, gva, &gpde);
-    if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
-        return l1e_empty();;
-
-    // This is actually overkill - we only need to make sure the hl2
-    // is in-sync.
-    //
-    shadow_sync_va(v, gva);
-
-    if ( unlikely(__copy_from_user(&gpte,
-                                   &linear_pg_table[gva >> PAGE_SHIFT],
-                                   sizeof(gpte))) )
-    {
-        FSH_LOG("gva_to_gpte got a fault on gva=%lx", gva);
-        return l1e_empty();
-    }
-
-    return gpte;
-}
-
-static inline unsigned long gva_to_gpa(unsigned long gva)
-{
-    l1_pgentry_t gpte;
-
-    gpte = gva_to_gpte(gva);
-    if ( !(l1e_get_flags(gpte) & _PAGE_PRESENT) )
-        return 0;
-
-    return l1e_get_paddr(gpte) + (gva & ~PAGE_MASK);
-}
-#endif
-
-static inline unsigned long gva_to_mfn(unsigned long gva)
-{
-    unsigned long gpa = gva_to_gpa(gva);
-    return get_mfn_from_gpfn(gpa >> PAGE_SHIFT);
-}
-
-/************************************************************************/
-
-extern void __update_pagetables(struct vcpu *v);
-static inline void update_pagetables(struct vcpu *v)
-{
-    struct domain *d = v->domain;
-    int paging_enabled;
-
-    if ( hvm_guest(v) )
-        paging_enabled = hvm_paging_enabled(v);
-    else
-        // HACK ALERT: there's currently no easy way to figure out if a domU
-        // has set its arch.guest_table to zero, vs not yet initialized it.
-        //
-        paging_enabled = !!pagetable_get_paddr(v->arch.guest_table);
-
-    /*
-     * We don't call __update_pagetables() when hvm guest paging is
-     * disabled as we want the linear_pg_table to be inaccessible so that
-     * we bail out early of shadow_fault() if the hvm guest tries illegal
-     * accesses while it thinks paging is turned off.
-     */
-    if ( unlikely(shadow_mode_enabled(d)) && paging_enabled )
-    {
-        shadow_lock(d);
-        __update_pagetables(v);
-        shadow_unlock(d);
-    }
-
-    if ( likely(!shadow_mode_external(d)) )
-    {
-        if ( shadow_mode_enabled(d) )
-            v->arch.monitor_table = v->arch.shadow_table;
-        else
-#if CONFIG_PAGING_LEVELS == 4
-        if ( !(v->arch.flags & TF_kernel_mode) )
-            v->arch.monitor_table = v->arch.guest_table_user;
-        else
-#endif
-            v->arch.monitor_table = v->arch.guest_table;
-    }
-}
-
-void clear_all_shadow_status(struct domain *d);
-
-#if SHADOW_DEBUG
-extern int _check_pagetable(struct vcpu *v, char *s);
-extern int _check_all_pagetables(struct vcpu *v, char *s);
-
-#define check_pagetable(_v, _s) _check_pagetable(_v, _s)
-//#define check_pagetable(_v, _s) _check_all_pagetables(_v, _s)
-
-#else
-#define check_pagetable(_v, _s) ((void)0)
-#endif
-
-#endif /* XEN_SHADOW_H */
+#endif /* _XEN_SHADOW_H */
 
 /*
  * Local variables:
diff --git a/xen/include/asm-x86/shadow2-multi.h b/xen/include/asm-x86/shadow2-multi.h
new file mode 100644 (file)
index 0000000..3b23a2f
--- /dev/null
@@ -0,0 +1,116 @@
+/******************************************************************************
+ * arch/x86/shadow2-multi.h
+ *
+ * Shadow2 declarations which will be multiply compiled.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+extern int 
+SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t gl1mfn, void *new_gl1p, u32 size);
+extern int 
+SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size);
+extern int 
+SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size);
+extern int 
+SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t gl3mfn, void *new_gl3p, u32 size);
+extern int 
+SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t gl4mfn, void *new_gl4p, u32 size);
+
+extern void 
+SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t smfn);
+extern void 
+SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t smfn);
+extern void 
+SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t smfn);
+extern void 
+SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, SHADOW_LEVELS, GUEST_LEVELS)(
+    struct vcpu *v, mfn_t smfn);
+
+extern void
+SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows, 3, 3)
+    (struct vcpu *v, mfn_t smfn);
+
+extern void 
+SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl2mfn);
+extern void 
+SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl3mfn);
+extern void 
+SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl4mfn);
+
+extern int
+SHADOW2_INTERNAL_NAME(sh2_remove_write_access, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn);
+extern int
+SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn);
+
+extern void
+SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, void *ep, mfn_t smfn);
+
+extern int
+SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn);
+extern int
+SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn);
+extern int
+SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn);
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES
+int 
+SHADOW2_INTERNAL_NAME(sh2_audit_l1_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl1mfn, mfn_t x);
+int 
+SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl1mfn, mfn_t x);
+int 
+SHADOW2_INTERNAL_NAME(sh2_audit_l2_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl2mfn, mfn_t x);
+int 
+SHADOW2_INTERNAL_NAME(sh2_audit_l3_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl3mfn, mfn_t x);
+int 
+SHADOW2_INTERNAL_NAME(sh2_audit_l4_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t sl4mfn, mfn_t x);
+#endif
+
+#if SHADOW_LEVELS == GUEST_LEVELS
+extern mfn_t
+SHADOW2_INTERNAL_NAME(sh2_make_monitor_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v);
+extern void
+SHADOW2_INTERNAL_NAME(sh2_destroy_monitor_table, SHADOW_LEVELS, GUEST_LEVELS)
+    (struct vcpu *v, mfn_t mmfn);
+#endif
+
+extern struct shadow2_entry_points 
+SHADOW2_INTERNAL_NAME(shadow2_entry, SHADOW_LEVELS, GUEST_LEVELS);
diff --git a/xen/include/asm-x86/shadow2-private.h b/xen/include/asm-x86/shadow2-private.h
new file mode 100644 (file)
index 0000000..7b2ac57
--- /dev/null
@@ -0,0 +1,612 @@
+/******************************************************************************
+ * arch/x86/shadow2-private.h
+ *
+ * Shadow2 code that is private, and does not need to be multiply compiled.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _XEN_SHADOW2_PRIVATE_H
+#define _XEN_SHADOW2_PRIVATE_H
+
+// In order to override the definition of mfn_to_page, we make sure page.h has
+// been included...
+#include <asm/page.h>
+#include <xen/domain_page.h>
+#include <asm/x86_emulate.h>
+#include <asm/hvm/support.h>
+
+
+/******************************************************************************
+ * Definitions for the use of the "available" bits in the shadow PTEs.
+ *
+ * Review of the low 12 bits of a shadow page table entry:
+ *
+ *         in a guest:                      in a shadow:
+ * Bit 11: _PAGE_AVAIL2, aka _PAGE_GNTTAB
+ * Bit 10: _PAGE_AVAIL1                     _PAGE_SHADOW_RW ("SW" below)
+ * Bit  9: _PAGE_AVAIL0                     _PAGE_SHADOW_PRESENT ("SP" below)
+ * Bit  8: _PAGE_GLOBAL                     _PAGE_SHADOW_MMIO ("MMIO" below),
+ *                                          aka _PAGE_SHADOW_GUEST_NOT_PRESENT
+ * Bit  7: _PAGE_PSE, aka _PAGE_PAT
+ * Bit  6: _PAGE_DIRTY
+ * Bit  5: _PAGE_ACCESSED
+ * Bit  4: _PAGE_PCD
+ * Bit  3: _PAGE_PWT
+ * Bit  2: _PAGE_USER
+ * Bit  1: _PAGE_RW ("GW" below)
+ * Bit  0: _PAGE_PRESENT ("GP" below)
+ *
+ * Given a guest entry, as shown below, we can expect the following in the
+ * corresponding shadow entry:
+ *
+ * Guest entry  Shadow entry      Commentary
+ * -----------  ----------------  ---------------------------------------------
+ *       Maps     
+ * GP GW  IO    GP SP GW SW MMIO 
+ * -- -- ----   -- -- -- -- ----
+ *  -  -   -     0  0  0  0   0   The guest entry has not yet been shadowed.
+ *  0  -   -     0  0  0  0   1   The guest entry is marked not-present.
+ *  1  1  no     ?  1  ?  1   0   Writable entry in the guest.
+ *  1  0  no     ?  1  0  0   0   Read-only entry in the guest.
+ *  1  1  yes    0  1  ?  1   1   Writable MMIO mapping in the guest.
+ *  1  0  yes    0  1  0  0   1   Read-only MMIO mapping in the guest.
+ *
+ * Normally, we would expect that GP=1 in the guest to imply GP=1 in the
+ * shadow, and similarly for GW=1.  However, various functionality that may be
+ * implemented via the shadow can cause GP or GW to be cleared in such cases.
+ * A & D bit emulation is a prime example of such functionality.
+ *
+ * If _PAGE_SHADOW_PRESENT is zero, then the _PAGE_PRESENT bit in that same
+ * entry will always be zero, too.
+ *
+ * Bit 11 is used in debug builds as the _PAGE_GNTTAB bit in PV guests.  It is
+ * currently available for random (ab)use in shadow entries.
+ *
+ * Bit 8 (the global bit) could be propagated from an HVM guest to the shadow,
+ * but currently there is no benefit, as the guest's TLB is flushed on every
+ * transition of CR3 anyway due to the HVM exit/re-entry.
+ *
+ * In shadow entries in which the _PAGE_SHADOW_PRESENT is set, bit 8 is used
+ * as the _PAGE_SHADOW_MMIO bit.  In such entries, if _PAGE_SHADOW_MMIO is
+ * set, then the entry contains the *gfn* directly from the corresponding
+ * guest entry (not an mfn!!).
+ *
+ * Bit 7 is set in a guest L2 to signify a superpage entry.  The current
+ * shadow code splinters superpage mappings into 512 or 1024 4K mappings; the
+ * resulting shadow L1 table is called an FL1.  Note that there is no guest
+ * page that corresponds to an FL1.
+ *
+ * Bit 7 in a guest L1 is the PAT2 bit.  Currently we do not support PAT in
+ * this shadow code.
+ *
+ * Bit 6 is the dirty bit.
+ *
+ * Bit 5 is the accessed bit.
+ *
+ * Bit 4 is the cache disable bit.  If set in a guest, the hardware is
+ * supposed to refuse to cache anything found via this entry.  It can be set
+ * in an L4e, L3e, L2e, or L1e.  This shadow code currently does not support
+ * cache disable bits.  They are silently ignored.
+ *
+ * Bit 4 in a guest L1 is also the PAT1 bit.  Currently we do not support PAT
+ * in this shadow code.
+ *
+ * Bit 3 is the cache write-thru bit.  If set in a guest, the hardware is
+ * supposed to use write-thru instead of write-back caching for anything found
+ * via this entry.  It can be set in an L4e, L3e, L2e, or L1e.  This shadow
+ * code currently does not support cache write-thru bits.  They are silently
+ * ignored.
+ *
+ * Bit 3 in a guest L1 is also the PAT0 bit.  Currently we do not support PAT
+ * in this shadow code.
+ *
+ * Bit 2 is the user bit.
+ *
+ * Bit 1 is the read-write bit.
+ *
+ * Bit 0 is the present bit.
+ */
+
+// Copy of the _PAGE_RW bit from the guest's PTE, appropriately zero'ed by
+// the appropriate shadow rules.
+#define _PAGE_SHADOW_RW                 _PAGE_AVAIL1
+
+// Copy of the _PAGE_PRESENT bit from the guest's PTE
+#define _PAGE_SHADOW_PRESENT            _PAGE_AVAIL0
+
+// The matching guest entry maps MMIO space
+#define _PAGE_SHADOW_MMIO               _PAGE_GLOBAL
+
+// Shadow flags value used when the guest is not present
+#define _PAGE_SHADOW_GUEST_NOT_PRESENT  _PAGE_GLOBAL
+
+
+/******************************************************************************
+ * Debug and error-message output
+ */
+#define SHADOW2_PRINTK(_f, _a...)                                     \
+    debugtrace_printk("sh2: %s(): " _f, __func__, ##_a)
+#define SHADOW2_ERROR(_f, _a...)                                      \
+    printk("sh2 error: %s(): " _f, __func__, ##_a)
+#define SHADOW2_DEBUG(flag, _f, _a...)                                \
+    do {                                                              \
+        if (SHADOW2_DEBUG_ ## flag)                                   \
+            debugtrace_printk("sh2debug: %s(): " _f, __func__, ##_a); \
+    } while (0)
+
+// The flags for use with SHADOW2_DEBUG:
+#define SHADOW2_DEBUG_PROPAGATE         0
+#define SHADOW2_DEBUG_MAKE_SHADOW       0
+#define SHADOW2_DEBUG_DESTROY_SHADOW    0
+#define SHADOW2_DEBUG_P2M               0
+#define SHADOW2_DEBUG_A_AND_D           0
+#define SHADOW2_DEBUG_EMULATE           0
+#define SHADOW2_DEBUG_LOGDIRTY          1
+
+
+/******************************************************************************
+ * Auditing routines 
+ */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL
+extern void shadow2_audit_tables(struct vcpu *v);
+#else
+#define shadow2_audit_tables(_v) do {} while(0)
+#endif
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M
+extern void shadow2_audit_p2m(struct domain *d);
+#else
+#define shadow2_audit_p2m(_d) do {} while(0)
+#endif
+
+
+/******************************************************************************
+ * Mechanism for double-checking the optimized pagefault path: this
+ * structure contains a record of actions taken by the fault handling
+ * code.  In paranoid mode, the fast-path code fills out one of these
+ * structures (but doesn't take any actual action) and then the normal 
+ * path fills in another.  When the fault handler finishes, the 
+ * two are compared */
+
+#ifdef SHADOW2_OPTIMIZATION_PARANOIA
+
+typedef struct shadow2_action_log sh2_log_t;
+struct shadow2_action_log {
+    paddr_t ad[CONFIG_PAGING_LEVELS];  /* A & D bits propagated here */
+    paddr_t mmio;                      /* Address of an mmio operation */
+    int rv;                            /* Result of the fault handler */
+};
+
+/* There are two logs, one for the fast path, one for the normal path */
+enum sh2_log_type { log_slow = 0, log_fast= 1 };
+
+/* Alloc and zero the logs */
+static inline void sh2_init_log(struct vcpu *v) 
+{
+    if ( unlikely(!v->arch.shadow2_action_log) ) 
+        v->arch.shadow2_action_log = xmalloc_array(sh2_log_t, 2);
+    ASSERT(v->arch.shadow2_action_log);
+    memset(v->arch.shadow2_action_log, 0, 2 * sizeof (sh2_log_t));
+}
+
+/* Log an A&D-bit update */
+static inline void sh2_log_ad(struct vcpu *v, paddr_t e, unsigned int level)
+{
+    v->arch.shadow2_action_log[v->arch.shadow2_action_index].ad[level] = e;
+}
+
+/* Log an MMIO address */
+static inline void sh2_log_mmio(struct vcpu *v, paddr_t m)
+{
+    v->arch.shadow2_action_log[v->arch.shadow2_action_index].mmio = m;
+}
+
+/* Log the result */
+static inline void sh2_log_rv(struct vcpu *v, int rv)
+{
+    v->arch.shadow2_action_log[v->arch.shadow2_action_index].rv = rv;
+}
+
+/* Set which mode we're in */
+static inline void sh2_set_log_mode(struct vcpu *v, enum sh2_log_type t) 
+{
+    v->arch.shadow2_action_index = t;
+}
+
+/* Know not to take action, because we're only checking the mechanism */
+static inline int sh2_take_no_action(struct vcpu *v) 
+{
+    return (v->arch.shadow2_action_index == log_fast);
+}
+
+#else /* Non-paranoid mode: these logs do not exist */
+
+#define sh2_init_log(_v) do { (void)(_v); } while(0)
+#define sh2_set_log_mode(_v,_t) do { (void)(_v); } while(0)
+#define sh2_log_ad(_v,_e,_l) do { (void)(_v),(void)(_e),(void)(_l); } while (0)
+#define sh2_log_mmio(_v,_m) do { (void)(_v),(void)(_m); } while (0)
+#define sh2_log_rv(_v,_r) do { (void)(_v),(void)(_r); } while (0)
+#define sh2_take_no_action(_v) (((void)(_v)), 0)
+
+#endif /* SHADOW2_OPTIMIZATION_PARANOIA */
+
+
+/******************************************************************************
+ * Macro for dealing with the naming of the internal names of the
+ * shadow code's external entry points.
+ */
+#define SHADOW2_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels) \
+    name ## __shadow_ ## shadow_levels ## _guest_ ## guest_levels
+#define SHADOW2_INTERNAL_NAME(name, shadow_levels, guest_levels) \
+    SHADOW2_INTERNAL_NAME_HIDDEN(name, shadow_levels, guest_levels)
+
+#if CONFIG_PAGING_LEVELS == 2
+#define GUEST_LEVELS  2
+#define SHADOW_LEVELS 2
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+#endif /* CONFIG_PAGING_LEVELS == 2 */
+
+#if CONFIG_PAGING_LEVELS == 3
+#define GUEST_LEVELS  2
+#define SHADOW_LEVELS 3
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+
+#define GUEST_LEVELS  3
+#define SHADOW_LEVELS 3
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+#endif /* CONFIG_PAGING_LEVELS == 3 */
+
+#if CONFIG_PAGING_LEVELS == 4
+#define GUEST_LEVELS  2
+#define SHADOW_LEVELS 3
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+
+#define GUEST_LEVELS  3
+#define SHADOW_LEVELS 3
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+
+#define GUEST_LEVELS  3
+#define SHADOW_LEVELS 4
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+
+#define GUEST_LEVELS  4
+#define SHADOW_LEVELS 4
+#include <asm/shadow2-multi.h>
+#undef GUEST_LEVELS
+#undef SHADOW_LEVELS
+#endif /* CONFIG_PAGING_LEVELS == 4 */
+
+
+/******************************************************************************
+ * Various function declarations 
+ */
+
+/* x86 emulator support */
+extern struct x86_emulate_ops shadow2_emulator_ops;
+
+/* Hash table functions */
+mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t);
+void  shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn);
+void  shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn);
+
+/* shadow promotion */
+void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type);
+void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type);
+
+/* Shadow page allocation functions */
+void  shadow2_prealloc(struct domain *d, unsigned int order);
+mfn_t shadow2_alloc(struct domain *d, 
+                    u32 shadow_type,
+                    unsigned long backpointer);
+void  shadow2_free(struct domain *d, mfn_t smfn);
+
+/* Function to convert a shadow to log-dirty */
+void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn);
+
+/* Dispatcher function: call the per-mode function that will unhook the
+ * non-Xen mappings in this top-level shadow mfn */
+void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn);
+
+/* Re-sync copies of PAE shadow L3 tables if they have been changed */
+void sh2_pae_recopy(struct domain *d);
+
+/* Install the xen mappings in various flavours of shadow */
+void sh2_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn);
+void sh2_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn);
+void sh2_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn);
+void sh2_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn);
+
+
+/******************************************************************************
+ * MFN/page-info handling 
+ */
+
+// Override mfn_to_page from asm/page.h, which was #include'd above,
+// in order to make it work with our mfn type.
+#undef mfn_to_page
+#define mfn_to_page(_mfn) (frame_table + mfn_x(_mfn))
+
+// Override page_to_mfn from asm/page.h, which was #include'd above,
+// in order to make it work with our mfn type.
+#undef page_to_mfn
+#define page_to_mfn(_pg) (_mfn((_pg) - frame_table))
+
+// Override mfn_valid from asm/page.h, which was #include'd above,
+// in order to make it work with our mfn type.
+#undef mfn_valid
+#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page)
+
+// Provide mfn_t-aware versions of common xen functions
+static inline void *
+sh2_map_domain_page(mfn_t mfn)
+{
+    /* XXX Using the monitor-table as a map will happen here  */
+    return map_domain_page(mfn_x(mfn));
+}
+
+static inline void 
+sh2_unmap_domain_page(void *p) 
+{
+    /* XXX Using the monitor-table as a map will happen here  */
+    unmap_domain_page(p);
+}
+
+static inline void *
+sh2_map_domain_page_global(mfn_t mfn)
+{
+    /* XXX Using the monitor-table as a map will happen here  */
+    return map_domain_page_global(mfn_x(mfn));
+}
+
+static inline void 
+sh2_unmap_domain_page_global(void *p) 
+{
+    /* XXX Using the monitor-table as a map will happen here  */
+    unmap_domain_page_global(p);
+}
+
+static inline int
+sh2_mfn_is_dirty(struct domain *d, mfn_t gmfn)
+/* Is this guest page dirty?  Call only in log-dirty mode. */
+{
+    unsigned long pfn;
+    ASSERT(shadow2_mode_log_dirty(d));
+    ASSERT(d->arch.shadow_dirty_bitmap != NULL);
+
+    /* We /really/ mean PFN here, even for non-translated guests. */
+    pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+    if ( likely(VALID_M2P(pfn))
+         && likely(pfn < d->arch.shadow_dirty_bitmap_size) 
+         && test_bit(pfn, d->arch.shadow_dirty_bitmap) )
+        return 1;
+
+    return 0;
+}
+
+static inline int
+sh2_mfn_is_a_page_table(mfn_t gmfn)
+{
+    struct page_info *page = mfn_to_page(gmfn);
+    struct domain *owner;
+    unsigned long type_info;
+
+    if ( !valid_mfn(gmfn) )
+        return 0;
+
+    owner = page_get_owner(page);
+    if ( owner && shadow2_mode_refcounts(owner) 
+         && (page->count_info & PGC_page_table) )
+        return 1; 
+
+    type_info = page->u.inuse.type_info & PGT_type_mask;
+    return type_info && (type_info <= PGT_l4_page_table);
+}
+
+
+/**************************************************************************/
+/* Shadow-page refcounting. See comment in shadow2-common.c about the  
+ * use of struct page_info fields for shadow pages */
+
+void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn);
+
+/* Increase the refcount of a shadow page.  Arguments are the mfn to refcount, 
+ * and the physical address of the shadow entry that holds the ref (or zero
+ * if the ref is held by something else) */
+static inline void sh2_get_ref(mfn_t smfn, paddr_t entry_pa)
+{
+    u32 x, nx;
+    struct page_info *page = mfn_to_page(smfn);
+
+    ASSERT(mfn_valid(smfn));
+
+    x = page->count_info & PGC_SH2_count_mask;
+    nx = x + 1;
+
+    if ( unlikely(nx & ~PGC_SH2_count_mask) )
+    {
+        SHADOW2_PRINTK("shadow ref overflow, gmfn=%" PRtype_info " smfn=%lx\n",
+                       page->u.inuse.type_info, mfn_x(smfn));
+        domain_crash_synchronous();
+    }
+    
+    /* Guarded by the shadow lock, so no need for atomic update */
+    page->count_info &= ~PGC_SH2_count_mask;
+    page->count_info |= nx;
+
+    /* We remember the first shadow entry that points to each shadow. */
+    if ( entry_pa != 0 && page->up == 0 ) 
+        page->up = entry_pa;
+}
+
+
+/* Decrease the refcount of a shadow page.  As for get_ref, takes the
+ * physical address of the shadow entry that held this reference. */
+static inline void sh2_put_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa)
+{
+    u32 x, nx;
+    struct page_info *page = mfn_to_page(smfn);
+
+    ASSERT(mfn_valid(smfn));
+    ASSERT(page_get_owner(page) == NULL);
+
+    /* If this is the entry in the up-pointer, remove it */
+    if ( entry_pa != 0 && page->up == entry_pa ) 
+        page->up = 0;
+
+    x = page->count_info & PGC_SH2_count_mask;
+    nx = x - 1;
+
+    if ( unlikely(x == 0) ) 
+    {
+        SHADOW2_PRINTK("shadow ref underflow, smfn=%lx oc=%08x t=%" 
+                       PRtype_info "\n",
+                       mfn_x(smfn),
+                       page->count_info & PGC_SH2_count_mask,
+                       page->u.inuse.type_info);
+        domain_crash_synchronous();
+    }
+
+    /* Guarded by the shadow lock, so no need for atomic update */
+    page->count_info &= ~PGC_SH2_count_mask;
+    page->count_info |= nx;
+
+    if ( unlikely(nx == 0) ) 
+        sh2_destroy_shadow(v, smfn);
+}
+
+
+/* Pin a shadow page: take an extra refcount and set the pin bit. */
+static inline void sh2_pin(mfn_t smfn)
+{
+    struct page_info *page;
+    
+    ASSERT(mfn_valid(smfn));
+    page = mfn_to_page(smfn);
+    if ( !(page->count_info & PGC_SH2_pinned) ) 
+    {
+        sh2_get_ref(smfn, 0);
+        page->count_info |= PGC_SH2_pinned;
+    }
+}
+
+/* Unpin a shadow page: unset the pin bit and release the extra ref. */
+static inline void sh2_unpin(struct vcpu *v, mfn_t smfn)
+{
+    struct page_info *page;
+    
+    ASSERT(mfn_valid(smfn));
+    page = mfn_to_page(smfn);
+    if ( page->count_info & PGC_SH2_pinned )
+    {
+        page->count_info &= ~PGC_SH2_pinned;
+        sh2_put_ref(v, smfn, 0);
+    }
+}
+
+/**************************************************************************/
+/* CPU feature support querying */
+
+static inline int
+guest_supports_superpages(struct vcpu *v)
+{
+    return hvm_guest(v) && (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE);
+}
+
+static inline int
+guest_supports_nx(struct vcpu *v)
+{
+    if ( !hvm_guest(v) )
+        return cpu_has_nx;
+
+    // XXX - fix this!
+    return 1;
+}
+
+/**************************************************************************/
+/* Guest physmap (p2m) support */
+
+/* Read our own P2M table, checking in the linear pagetables first to be
+ * sure that we will succeed.  Call this function if you expect it to
+ * fail often, as it avoids page faults.  If you expect to succeed, use
+ * vcpu_gfn_to_mfn, which copy_from_user()s the entry */
+static inline mfn_t
+vcpu_gfn_to_mfn_nofault(struct vcpu *v, unsigned long gfn)
+{
+    unsigned long entry_addr = (unsigned long) &phys_to_machine_mapping[gfn];
+#if CONFIG_PAGING_LEVELS >= 4
+    l4_pgentry_t *l4e;
+    l3_pgentry_t *l3e;
+#endif
+    l2_pgentry_t *l2e;
+    l1_pgentry_t *l1e;
+
+    ASSERT(current == v);
+    if ( !shadow2_vcpu_mode_translate(v) )
+        return _mfn(gfn);
+
+#if CONFIG_PAGING_LEVELS > 2
+    if ( gfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) 
+        /* This pfn is higher than the p2m map can hold */
+        return _mfn(INVALID_MFN);
+#endif
+    
+    /* Walk the linear pagetables.  Note that this is *not* the same as 
+     * the walk in sh2_gfn_to_mfn_foreign, which is walking the p2m map */
+#if CONFIG_PAGING_LEVELS >= 4
+    l4e = __linear_l4_table + l4_linear_offset(entry_addr);
+    if ( !(l4e_get_flags(*l4e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
+    l3e = __linear_l3_table + l3_linear_offset(entry_addr);
+    if ( !(l3e_get_flags(*l3e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
+#endif
+    l2e = __linear_l2_table + l2_linear_offset(entry_addr);
+    if ( !(l2e_get_flags(*l2e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
+    l1e = __linear_l1_table + l1_linear_offset(entry_addr);
+    if ( !(l1e_get_flags(*l1e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN);
+
+    /* Safe to look at this part of the table */
+    if ( l1e_get_flags(phys_to_machine_mapping[gfn])  & _PAGE_PRESENT )
+        return _mfn(l1e_get_pfn(phys_to_machine_mapping[gfn]));
+    
+    return _mfn(INVALID_MFN);
+}
+
+
+#endif /* _XEN_SHADOW2_PRIVATE_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/shadow2-types.h b/xen/include/asm-x86/shadow2-types.h
new file mode 100644 (file)
index 0000000..f593c97
--- /dev/null
@@ -0,0 +1,705 @@
+/******************************************************************************
+ * include/asm-x86/shadow2-types.h
+ * 
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _XEN_SHADOW2_TYPES_H
+#define _XEN_SHADOW2_TYPES_H
+
+// Map a shadow page
+static inline void *
+map_shadow_page(mfn_t smfn)
+{
+    // XXX -- Possible optimization/measurement question for 32-bit and PAE
+    //        hypervisors:
+    //        How often is this smfn already available in the shadow linear
+    //        table?  Might it be worth checking that table first,
+    //        presumably using the reverse map hint in the page_info of this
+    //        smfn, rather than calling map_domain_page()?
+    //
+    return sh2_map_domain_page(smfn);
+}
+
+// matching unmap for map_shadow_page()
+static inline void
+unmap_shadow_page(void *p)
+{
+    sh2_unmap_domain_page(p);
+}
+
+/* 
+ * Define various types for handling pagetables, based on these options:
+ * SHADOW_PAGING_LEVELS : Number of levels of shadow pagetables
+ * GUEST_PAGING_LEVELS  : Number of levels of guest pagetables
+ */
+
+#if (CONFIG_PAGING_LEVELS < SHADOW_PAGING_LEVELS) 
+#error Cannot have more levels of shadow pagetables than host pagetables
+#endif
+
+#if (SHADOW_PAGING_LEVELS < GUEST_PAGING_LEVELS) 
+#error Cannot have more levels of guest pagetables than shadow pagetables
+#endif
+
+#if SHADOW_PAGING_LEVELS == 2
+#define SHADOW_L1_PAGETABLE_ENTRIES    1024
+#define SHADOW_L2_PAGETABLE_ENTRIES    1024
+#define SHADOW_L1_PAGETABLE_SHIFT        12
+#define SHADOW_L2_PAGETABLE_SHIFT        22
+#endif
+
+#if SHADOW_PAGING_LEVELS == 3
+#define SHADOW_L1_PAGETABLE_ENTRIES     512
+#define SHADOW_L2_PAGETABLE_ENTRIES     512
+#define SHADOW_L3_PAGETABLE_ENTRIES       4
+#define SHADOW_L1_PAGETABLE_SHIFT        12
+#define SHADOW_L2_PAGETABLE_SHIFT        21
+#define SHADOW_L3_PAGETABLE_SHIFT        30
+#endif
+
+#if SHADOW_PAGING_LEVELS == 4
+#define SHADOW_L1_PAGETABLE_ENTRIES     512
+#define SHADOW_L2_PAGETABLE_ENTRIES     512
+#define SHADOW_L3_PAGETABLE_ENTRIES     512
+#define SHADOW_L4_PAGETABLE_ENTRIES     512
+#define SHADOW_L1_PAGETABLE_SHIFT        12
+#define SHADOW_L2_PAGETABLE_SHIFT        21
+#define SHADOW_L3_PAGETABLE_SHIFT        30
+#define SHADOW_L4_PAGETABLE_SHIFT        39
+#endif
+
+/* Types of the shadow page tables */
+typedef l1_pgentry_t shadow_l1e_t;
+typedef l2_pgentry_t shadow_l2e_t;
+#if SHADOW_PAGING_LEVELS >= 3
+typedef l3_pgentry_t shadow_l3e_t;
+#if SHADOW_PAGING_LEVELS >= 4
+typedef l4_pgentry_t shadow_l4e_t;
+#endif
+#endif
+
+/* Access functions for them */
+static inline paddr_t shadow_l1e_get_paddr(shadow_l1e_t sl1e)
+{ return l1e_get_paddr(sl1e); }
+static inline paddr_t shadow_l2e_get_paddr(shadow_l2e_t sl2e)
+{ return l2e_get_paddr(sl2e); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline paddr_t shadow_l3e_get_paddr(shadow_l3e_t sl3e)
+{ return l3e_get_paddr(sl3e); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline paddr_t shadow_l4e_get_paddr(shadow_l4e_t sl4e)
+{ return l4e_get_paddr(sl4e); }
+#endif
+#endif
+
+static inline mfn_t shadow_l1e_get_mfn(shadow_l1e_t sl1e)
+{ return _mfn(l1e_get_pfn(sl1e)); }
+static inline mfn_t shadow_l2e_get_mfn(shadow_l2e_t sl2e)
+{ return _mfn(l2e_get_pfn(sl2e)); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline mfn_t shadow_l3e_get_mfn(shadow_l3e_t sl3e)
+{ return _mfn(l3e_get_pfn(sl3e)); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline mfn_t shadow_l4e_get_mfn(shadow_l4e_t sl4e)
+{ return _mfn(l4e_get_pfn(sl4e)); }
+#endif
+#endif
+
+static inline u32 shadow_l1e_get_flags(shadow_l1e_t sl1e)
+{ return l1e_get_flags(sl1e); }
+static inline u32 shadow_l2e_get_flags(shadow_l2e_t sl2e)
+{ return l2e_get_flags(sl2e); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline u32 shadow_l3e_get_flags(shadow_l3e_t sl3e)
+{ return l3e_get_flags(sl3e); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline u32 shadow_l4e_get_flags(shadow_l4e_t sl4e)
+{ return l4e_get_flags(sl4e); }
+#endif
+#endif
+
+static inline shadow_l1e_t
+shadow_l1e_remove_flags(shadow_l1e_t sl1e, u32 flags)
+{ l1e_remove_flags(sl1e, flags); return sl1e; }
+
+static inline shadow_l1e_t shadow_l1e_empty(void) 
+{ return l1e_empty(); }
+static inline shadow_l2e_t shadow_l2e_empty(void) 
+{ return l2e_empty(); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline shadow_l3e_t shadow_l3e_empty(void) 
+{ return l3e_empty(); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline shadow_l4e_t shadow_l4e_empty(void) 
+{ return l4e_empty(); }
+#endif
+#endif
+
+static inline shadow_l1e_t shadow_l1e_from_mfn(mfn_t mfn, u32 flags)
+{ return l1e_from_pfn(mfn_x(mfn), flags); }
+static inline shadow_l2e_t shadow_l2e_from_mfn(mfn_t mfn, u32 flags)
+{ return l2e_from_pfn(mfn_x(mfn), flags); }
+#if SHADOW_PAGING_LEVELS >= 3
+static inline shadow_l3e_t shadow_l3e_from_mfn(mfn_t mfn, u32 flags)
+{ return l3e_from_pfn(mfn_x(mfn), flags); }
+#if SHADOW_PAGING_LEVELS >= 4
+static inline shadow_l4e_t shadow_l4e_from_mfn(mfn_t mfn, u32 flags)
+{ return l4e_from_pfn(mfn_x(mfn), flags); }
+#endif
+#endif
+
+#define shadow_l1_table_offset(a) l1_table_offset(a)
+#define shadow_l2_table_offset(a) l2_table_offset(a)
+#define shadow_l3_table_offset(a) l3_table_offset(a)
+#define shadow_l4_table_offset(a) l4_table_offset(a)
+
+/**************************************************************************/
+/* Access to the linear mapping of shadow page tables. */
+
+/* Offsets into each level of the linear mapping for a virtual address. */
+#define shadow_l1_linear_offset(_a)                                           \
+        (((_a) & VADDR_MASK) >> SHADOW_L1_PAGETABLE_SHIFT)
+#define shadow_l2_linear_offset(_a)                                           \
+        (((_a) & VADDR_MASK) >> SHADOW_L2_PAGETABLE_SHIFT)
+#define shadow_l3_linear_offset(_a)                                           \
+        (((_a) & VADDR_MASK) >> SHADOW_L3_PAGETABLE_SHIFT)
+#define shadow_l4_linear_offset(_a)                                           \
+        (((_a) & VADDR_MASK) >> SHADOW_L4_PAGETABLE_SHIFT)
+
+/* Where to find each level of the linear mapping.  For PV guests, we use 
+ * the shadow linear-map self-entry as many times as we need.  For HVM 
+ * guests, the shadow doesn't have a linear-map self-entry so we must use 
+ * the monitor-table's linear-map entry N-1 times and then the shadow-map 
+ * entry once. */
+#define __sh2_linear_l1_table ((shadow_l1e_t *)(SH_LINEAR_PT_VIRT_START))
+#define __sh2_linear_l2_table ((shadow_l2e_t *)                               \
+    (__sh2_linear_l1_table + shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START)))
+
+// shadow linear L3 and L4 tables only exist in 4 level paging...
+#if SHADOW_PAGING_LEVELS == 4
+#define __sh2_linear_l3_table ((shadow_l3e_t *)                               \
+    (__sh2_linear_l2_table + shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START)))
+#define __sh2_linear_l4_table ((shadow_l4e_t *)                               \
+    (__sh2_linear_l3_table + shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START)))
+#endif
+
+#define sh2_linear_l1_table(v) ({ \
+    ASSERT(current == (v)); \
+    __sh2_linear_l1_table; \
+})
+
+#define sh2_linear_l2_table(v) ({ \
+    ASSERT(current == (v)); \
+    ((shadow_l2e_t *) \
+     (hvm_guest(v) ? __linear_l1_table : __sh2_linear_l1_table) + \
+     shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START)); \
+})
+
+// shadow linear L3 and L4 tables only exist in 4 level paging...
+#if SHADOW_PAGING_LEVELS == 4
+#define sh2_linear_l3_table(v) ({ \
+    ASSERT(current == (v)); \
+    ((shadow_l3e_t *) \
+     (hvm_guest(v) ? __linear_l2_table : __sh2_linear_l2_table) + \
+      shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START)); \
+})
+
+// we use l4_pgentry_t instead of shadow_l4e_t below because shadow_l4e_t is
+// not defined for when xen_levels==4 & shadow_levels==3...
+#define sh2_linear_l4_table(v) ({ \
+    ASSERT(current == (v)); \
+    ((l4_pgentry_t *) \
+     (hvm_guest(v) ? __linear_l3_table : __sh2_linear_l3_table) + \
+      shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START)); \
+})
+#endif
+
+#if GUEST_PAGING_LEVELS == 2
+
+#include <asm/page-guest32.h>
+
+#define GUEST_L1_PAGETABLE_ENTRIES     1024
+#define GUEST_L2_PAGETABLE_ENTRIES     1024
+#define GUEST_L1_PAGETABLE_SHIFT         12
+#define GUEST_L2_PAGETABLE_SHIFT         22
+
+/* Type of the guest's frame numbers */
+TYPE_SAFE(u32,gfn)
+#define INVALID_GFN ((u32)(-1u))
+#define SH2_PRI_gfn "05x"
+
+/* Types of the guest's page tables */
+typedef l1_pgentry_32_t guest_l1e_t;
+typedef l2_pgentry_32_t guest_l2e_t;
+
+/* Access functions for them */
+static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
+{ return l1e_get_paddr_32(gl1e); }
+static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
+{ return l2e_get_paddr_32(gl2e); }
+
+static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
+{ return _gfn(l1e_get_paddr_32(gl1e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
+{ return _gfn(l2e_get_paddr_32(gl2e) >> PAGE_SHIFT); }
+
+static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
+{ return l1e_get_flags_32(gl1e); }
+static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
+{ return l2e_get_flags_32(gl2e); }
+
+static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags)
+{ l1e_add_flags_32(gl1e, flags); return gl1e; }
+static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags)
+{ l2e_add_flags_32(gl2e, flags); return gl2e; }
+
+static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
+{ return l1e_from_pfn_32(gfn_x(gfn), flags); }
+static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
+{ return l2e_from_pfn_32(gfn_x(gfn), flags); }
+
+#define guest_l1_table_offset(a) l1_table_offset_32(a)
+#define guest_l2_table_offset(a) l2_table_offset_32(a)
+
+/* The shadow types needed for the various levels. */
+#define PGC_SH2_l1_shadow  PGC_SH2_l1_32_shadow
+#define PGC_SH2_l2_shadow  PGC_SH2_l2_32_shadow
+#define PGC_SH2_fl1_shadow PGC_SH2_fl1_32_shadow
+
+#else /* GUEST_PAGING_LEVELS != 2 */
+
+#if GUEST_PAGING_LEVELS == 3
+#define GUEST_L1_PAGETABLE_ENTRIES      512
+#define GUEST_L2_PAGETABLE_ENTRIES      512
+#define GUEST_L3_PAGETABLE_ENTRIES        4
+#define GUEST_L1_PAGETABLE_SHIFT         12
+#define GUEST_L2_PAGETABLE_SHIFT         21
+#define GUEST_L3_PAGETABLE_SHIFT         30
+#else /* GUEST_PAGING_LEVELS == 4 */
+#define GUEST_L1_PAGETABLE_ENTRIES      512
+#define GUEST_L2_PAGETABLE_ENTRIES      512
+#define GUEST_L3_PAGETABLE_ENTRIES      512
+#define GUEST_L4_PAGETABLE_ENTRIES      512
+#define GUEST_L1_PAGETABLE_SHIFT         12
+#define GUEST_L2_PAGETABLE_SHIFT         21
+#define GUEST_L3_PAGETABLE_SHIFT         30
+#define GUEST_L4_PAGETABLE_SHIFT         39
+#endif
+
+/* Type of the guest's frame numbers */
+TYPE_SAFE(unsigned long,gfn)
+#define INVALID_GFN ((unsigned long)(-1ul))
+#define SH2_PRI_gfn "05lx"
+
+/* Types of the guest's page tables */
+typedef l1_pgentry_t guest_l1e_t;
+typedef l2_pgentry_t guest_l2e_t;
+typedef l3_pgentry_t guest_l3e_t;
+#if GUEST_PAGING_LEVELS >= 4
+typedef l4_pgentry_t guest_l4e_t;
+#endif
+
+/* Access functions for them */
+static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e)
+{ return l1e_get_paddr(gl1e); }
+static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e)
+{ return l2e_get_paddr(gl2e); }
+static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e)
+{ return l3e_get_paddr(gl3e); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e)
+{ return l4e_get_paddr(gl4e); }
+#endif
+
+static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e)
+{ return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e)
+{ return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); }
+static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e)
+{ return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline gfn_t guest_l4e_get_gfn(guest_l4e_t gl4e)
+{ return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); }
+#endif
+
+static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e)
+{ return l1e_get_flags(gl1e); }
+static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e)
+{ return l2e_get_flags(gl2e); }
+static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e)
+{ return l3e_get_flags(gl3e); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e)
+{ return l4e_get_flags(gl4e); }
+#endif
+
+static inline guest_l1e_t guest_l1e_add_flags(guest_l1e_t gl1e, u32 flags)
+{ l1e_add_flags(gl1e, flags); return gl1e; }
+static inline guest_l2e_t guest_l2e_add_flags(guest_l2e_t gl2e, u32 flags)
+{ l2e_add_flags(gl2e, flags); return gl2e; }
+static inline guest_l3e_t guest_l3e_add_flags(guest_l3e_t gl3e, u32 flags)
+{ l3e_add_flags(gl3e, flags); return gl3e; }
+#if GUEST_PAGING_LEVELS >= 4
+static inline guest_l4e_t guest_l4e_add_flags(guest_l4e_t gl4e, u32 flags)
+{ l4e_add_flags(gl4e, flags); return gl4e; }
+#endif
+
+static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags)
+{ return l1e_from_pfn(gfn_x(gfn), flags); }
+static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags)
+{ return l2e_from_pfn(gfn_x(gfn), flags); }
+static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags)
+{ return l3e_from_pfn(gfn_x(gfn), flags); }
+#if GUEST_PAGING_LEVELS >= 4
+static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags)
+{ return l4e_from_pfn(gfn_x(gfn), flags); }
+#endif
+
+#define guest_l1_table_offset(a) l1_table_offset(a)
+#define guest_l2_table_offset(a) l2_table_offset(a)
+#define guest_l3_table_offset(a) l3_table_offset(a)
+#define guest_l4_table_offset(a) l4_table_offset(a)
+
+/* The shadow types needed for the various levels. */
+#if GUEST_PAGING_LEVELS == 3
+#define PGC_SH2_l1_shadow  PGC_SH2_l1_pae_shadow
+#define PGC_SH2_fl1_shadow PGC_SH2_fl1_pae_shadow
+#define PGC_SH2_l2_shadow  PGC_SH2_l2_pae_shadow
+#define PGC_SH2_l2h_shadow PGC_SH2_l2h_pae_shadow
+#define PGC_SH2_l3_shadow  PGC_SH2_l3_pae_shadow
+#else
+#define PGC_SH2_l1_shadow  PGC_SH2_l1_64_shadow
+#define PGC_SH2_fl1_shadow PGC_SH2_fl1_64_shadow
+#define PGC_SH2_l2_shadow  PGC_SH2_l2_64_shadow
+#define PGC_SH2_l3_shadow  PGC_SH2_l3_64_shadow
+#define PGC_SH2_l4_shadow  PGC_SH2_l4_64_shadow
+#endif
+
+#endif /* GUEST_PAGING_LEVELS != 2 */
+
+#define VALID_GFN(m) ((m) != INVALID_GFN) /* parenthesize arg: m may be any expression */
+
+static inline int
+valid_gfn(gfn_t m)
+{
+    return VALID_GFN(gfn_x(m));
+}
+
+#if GUEST_PAGING_LEVELS == 2
+#define PGC_SH2_guest_root_type PGC_SH2_l2_32_shadow
+#elif GUEST_PAGING_LEVELS == 3
+#define PGC_SH2_guest_root_type PGC_SH2_l3_pae_shadow
+#else
+#define PGC_SH2_guest_root_type PGC_SH2_l4_64_shadow
+#endif
+
+/* Translation between mfns and gfns */
+static inline mfn_t
+vcpu_gfn_to_mfn(struct vcpu *v, gfn_t gfn)
+{
+    return sh2_vcpu_gfn_to_mfn(v, gfn_x(gfn));
+} 
+
+static inline gfn_t
+mfn_to_gfn(struct domain *d, mfn_t mfn)
+{
+    return _gfn(sh2_mfn_to_gfn(d, mfn));
+}
+
+static inline paddr_t
+gfn_to_paddr(gfn_t gfn)
+{
+    return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT;
+}
+
+/* Type used for recording a walk through guest pagetables.  It is
+ * filled in by the pagetable walk function, and also used as a cache
+ * for later walks.  
+ * Any non-null pointer in this structure represents a mapping of guest
+ * memory.  We must always call walk_init() before using a walk_t, and 
+ * call walk_unmap() when we're done. 
+ * The "Effective l1e" field is used when there isn't an l1e to point to, 
+ * but we have fabricated an l1e for propagation to the shadow (e.g., 
+ * for splintering guest superpages into many shadow l1 entries).  */
+typedef struct shadow2_walk_t walk_t;
+struct shadow2_walk_t 
+{
+    unsigned long va;           /* Address we were looking for */
+#if GUEST_PAGING_LEVELS >= 3
+#if GUEST_PAGING_LEVELS >= 4
+    guest_l4e_t *l4e;           /* Pointer to guest's level 4 entry */
+#endif
+    guest_l3e_t *l3e;           /* Pointer to guest's level 3 entry */
+#endif
+    guest_l2e_t *l2e;           /* Pointer to guest's level 2 entry */
+    guest_l1e_t *l1e;           /* Pointer to guest's level 1 entry */
+    guest_l1e_t eff_l1e;        /* Effective level 1 entry */
+#if GUEST_PAGING_LEVELS >= 3
+#if GUEST_PAGING_LEVELS >= 4
+    mfn_t l4mfn;                /* MFN that the level 4 entry is in */
+#endif
+    mfn_t l3mfn;                /* MFN that the level 3 entry is in */
+#endif
+    mfn_t l2mfn;                /* MFN that the level 2 entry is in */
+    mfn_t l1mfn;                /* MFN that the level 1 entry is in */
+};
+
+
+/* X86 error code bits:
+ * These bits certainly ought to be defined somewhere other than here,
+ * but until that place is determined, here they sit.
+ *
+ * "PFEC" == "Page Fault Error Code"
+ */
+#define X86_PFEC_PRESENT            1  /* 0 == page was not present */
+#define X86_PFEC_WRITE_FAULT        2  /* 0 == reading, 1 == writing */
+#define X86_PFEC_SUPERVISOR_FAULT   4  /* 0 == supervisor-mode, 1 == user */
+#define X86_PFEC_RESERVED_BIT_FAULT 8  /* 1 == reserved bits set in pte */
+#define X86_PFEC_INSN_FETCH_FAULT  16  /* 0 == normal, 1 == instr'n fetch */
+
+/* macros for dealing with the naming of the internal function names of the
+ * shadow code's external entry points.
+ */
+#define INTERNAL_NAME(name) \
+    SHADOW2_INTERNAL_NAME(name, SHADOW_PAGING_LEVELS, GUEST_PAGING_LEVELS)
+
+/* macros for renaming the primary entry points, so that they are more
+ * easily distinguished from a debugger
+ */
+#define sh2_page_fault              INTERNAL_NAME(sh2_page_fault)
+#define sh2_invlpg                  INTERNAL_NAME(sh2_invlpg)
+#define sh2_gva_to_gpa              INTERNAL_NAME(sh2_gva_to_gpa)
+#define sh2_gva_to_gfn              INTERNAL_NAME(sh2_gva_to_gfn)
+#define sh2_update_cr3              INTERNAL_NAME(sh2_update_cr3)
+#define sh2_remove_write_access     INTERNAL_NAME(sh2_remove_write_access)
+#define sh2_remove_all_mappings     INTERNAL_NAME(sh2_remove_all_mappings)
+#define sh2_remove_l1_shadow        INTERNAL_NAME(sh2_remove_l1_shadow)
+#define sh2_remove_l2_shadow        INTERNAL_NAME(sh2_remove_l2_shadow)
+#define sh2_remove_l3_shadow        INTERNAL_NAME(sh2_remove_l3_shadow)
+#define sh2_map_and_validate_gl4e   INTERNAL_NAME(sh2_map_and_validate_gl4e)
+#define sh2_map_and_validate_gl3e   INTERNAL_NAME(sh2_map_and_validate_gl3e)
+#define sh2_map_and_validate_gl2e   INTERNAL_NAME(sh2_map_and_validate_gl2e)
+#define sh2_map_and_validate_gl2he  INTERNAL_NAME(sh2_map_and_validate_gl2he)
+#define sh2_map_and_validate_gl1e   INTERNAL_NAME(sh2_map_and_validate_gl1e)
+#define sh2_destroy_l4_shadow       INTERNAL_NAME(sh2_destroy_l4_shadow)
+#define sh2_destroy_l3_shadow       INTERNAL_NAME(sh2_destroy_l3_shadow)
+#define sh2_destroy_l3_subshadow    INTERNAL_NAME(sh2_destroy_l3_subshadow)
+#define sh2_unpin_all_l3_subshadows INTERNAL_NAME(sh2_unpin_all_l3_subshadows)
+#define sh2_destroy_l2_shadow       INTERNAL_NAME(sh2_destroy_l2_shadow)
+#define sh2_destroy_l1_shadow       INTERNAL_NAME(sh2_destroy_l1_shadow)
+#define sh2_unhook_32b_mappings     INTERNAL_NAME(sh2_unhook_32b_mappings)
+#define sh2_unhook_pae_mappings     INTERNAL_NAME(sh2_unhook_pae_mappings)
+#define sh2_unhook_64b_mappings     INTERNAL_NAME(sh2_unhook_64b_mappings)
+#define shadow2_entry               INTERNAL_NAME(shadow2_entry)
+#define sh2_detach_old_tables       INTERNAL_NAME(sh2_detach_old_tables)
+#define sh2_x86_emulate_write       INTERNAL_NAME(sh2_x86_emulate_write)
+#define sh2_x86_emulate_cmpxchg     INTERNAL_NAME(sh2_x86_emulate_cmpxchg)
+#define sh2_x86_emulate_cmpxchg8b   INTERNAL_NAME(sh2_x86_emulate_cmpxchg8b)
+#define sh2_audit_l1_table          INTERNAL_NAME(sh2_audit_l1_table)
+#define sh2_audit_fl1_table         INTERNAL_NAME(sh2_audit_fl1_table)
+#define sh2_audit_l2_table          INTERNAL_NAME(sh2_audit_l2_table)
+#define sh2_audit_l3_table          INTERNAL_NAME(sh2_audit_l3_table)
+#define sh2_audit_l4_table          INTERNAL_NAME(sh2_audit_l4_table)
+#define sh2_guess_wrmap             INTERNAL_NAME(sh2_guess_wrmap)
+#define sh2_clear_shadow_entry      INTERNAL_NAME(sh2_clear_shadow_entry)
+
+/* sh2_make_monitor_table only depends on the number of shadow levels */
+#define sh2_make_monitor_table                          \
+        SHADOW2_INTERNAL_NAME(sh2_make_monitor_table,   \
+                              SHADOW_PAGING_LEVELS,     \
+                              SHADOW_PAGING_LEVELS)
+#define sh2_destroy_monitor_table                               \
+        SHADOW2_INTERNAL_NAME(sh2_destroy_monitor_table,        \
+                              SHADOW_PAGING_LEVELS,             \
+                              SHADOW_PAGING_LEVELS)
+
+
+#if GUEST_PAGING_LEVELS == 3
+/*
+ * Accounting information stored in the shadow of PAE Guest L3 pages.
+ * Because these "L3 pages" are only 32-bytes, it is inconvenient to keep
+ * various refcounts, etc., on the page_info of their page.  We provide extra
+ * bookkeeping space in the shadow itself, and this is the structure
+ * definition for that bookkeeping information.
+ */
+struct pae_l3_bookkeeping {
+    u32 vcpus;                  /* bitmap of which vcpus are currently storing
+                                 * copies of this 32-byte page */
+    u32 refcount;               /* refcount for this 32-byte page */
+    u8 pinned;                  /* is this 32-byte page pinned or not? */
+};
+
+// Convert a shadow entry pointer into a pae_l3_bookkeeping pointer.
+#define sl3p_to_info(_ptr) ((struct pae_l3_bookkeeping *)         \
+                            (((unsigned long)(_ptr) & ~31) + 32))
+
+static void sh2_destroy_l3_subshadow(struct vcpu *v, 
+                                     shadow_l3e_t *sl3e);
+
+/* Increment a subshadow ref
+ * Called with a pointer to the subshadow, and the mfn of the
+ * *first* page of the overall shadow. */
+static inline void sh2_get_ref_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn)
+{
+    struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e);
+
+    /* First ref to the subshadow takes a ref to the full shadow */
+    if ( bk->refcount == 0 ) 
+        sh2_get_ref(smfn, 0);
+    if ( unlikely(++(bk->refcount) == 0) )
+    {
+        SHADOW2_PRINTK("shadow l3 subshadow ref overflow, smfn=%" SH2_PRI_mfn " sh=%p\n", 
+                       mfn_x(smfn), sl3e);
+        domain_crash_synchronous();
+    }
+}
+
+/* Decrement a subshadow ref.
+ * Called with a pointer to the subshadow, and the mfn of the
+ * *first* page of the overall shadow.  Calling this may cause the 
+ * entire shadow to disappear, so the caller must immediately unmap 
+ * the pointer after calling. */ 
+static inline void sh2_put_ref_l3_subshadow(struct vcpu *v, 
+                                            shadow_l3e_t *sl3e,
+                                            mfn_t smfn)
+{
+    struct pae_l3_bookkeeping *bk;
+
+    bk = sl3p_to_info(sl3e);
+
+    ASSERT(bk->refcount > 0);
+    if ( --(bk->refcount) == 0 )
+    {
+        /* Need to destroy this subshadow */
+        sh2_destroy_l3_subshadow(v, sl3e);
+        /* Last ref to the subshadow had a ref to the full shadow */
+        sh2_put_ref(v, smfn, 0);
+    }
+}
+
+/* Pin a subshadow 
+ * Called with a pointer to the subshadow, and the mfn of the
+ * *first* page of the overall shadow. */
+static inline void sh2_pin_l3_subshadow(shadow_l3e_t *sl3e, mfn_t smfn)
+{
+    struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e);
+
+#if 0
+    debugtrace_printk("%s smfn=%05lx offset=%ld\n",
+                      __func__, mfn_x(smfn),
+                      ((unsigned long)sl3e & ~PAGE_MASK) / 64);
+#endif
+
+    if ( !bk->pinned )
+    {
+        bk->pinned = 1;
+        sh2_get_ref_l3_subshadow(sl3e, smfn);
+    }
+}
+
+/* Unpin a sub-shadow. 
+ * Called with a pointer to the subshadow, and the mfn of the
+ * *first* page of the overall shadow.  Calling this may cause the 
+ * entire shadow to disappear, so the caller must immediately unmap 
+ * the pointer after calling. */ 
+static inline void sh2_unpin_l3_subshadow(struct vcpu *v, 
+                                          shadow_l3e_t *sl3e,
+                                          mfn_t smfn)
+{
+    struct pae_l3_bookkeeping *bk = sl3p_to_info(sl3e);
+
+#if 0
+    debugtrace_printk("%s smfn=%05lx offset=%ld\n",
+                      __func__, mfn_x(smfn),
+                      ((unsigned long)sl3e & ~PAGE_MASK) / 64);
+#endif
+
+    if ( bk->pinned )
+    {
+        bk->pinned = 0;
+        sh2_put_ref_l3_subshadow(v, sl3e, smfn);
+    }
+}
+
+#endif /* GUEST_PAGING_LEVELS == 3 */
+
+#if SHADOW_PAGING_LEVELS == 3
+#define MFN_FITS_IN_HVM_CR3(_MFN) (!(mfn_x(_MFN) >> 20)) /* parenthesized expansion */
+#endif
+
+#if SHADOW_PAGING_LEVELS == 2
+#define SH2_PRI_pte "08x"
+#else /* SHADOW_PAGING_LEVELS >= 3 */
+#ifndef __x86_64__
+#define SH2_PRI_pte "016llx"
+#else
+#define SH2_PRI_pte "016lx"
+#endif
+#endif /* SHADOW_PAGING_LEVELS >= 3 */
+
+#if GUEST_PAGING_LEVELS == 2
+#define SH2_PRI_gpte "08x"
+#else /* GUEST_PAGING_LEVELS >= 3 */
+#ifndef __x86_64__
+#define SH2_PRI_gpte "016llx"
+#else
+#define SH2_PRI_gpte "016lx"
+#endif
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
+static inline u32
+accumulate_guest_flags(walk_t *gw)
+{
+    u32 accumulated_flags;
+
+    // We accumulate the permission flags with bitwise ANDing.
+    // This works for the PRESENT bit, RW bit, and USER bit.
+    // For the NX bit, however, the polarity is wrong, so we accumulate the
+    // inverse of the NX bit.
+    //
+    accumulated_flags =  guest_l1e_get_flags(gw->eff_l1e) ^ _PAGE_NX_BIT;
+    accumulated_flags &= guest_l2e_get_flags(*gw->l2e) ^ _PAGE_NX_BIT;
+
+    // Note that PAE guests do not have USER or RW or NX bits in their L3s.
+    //
+#if GUEST_PAGING_LEVELS == 3
+    accumulated_flags &=
+        ~_PAGE_PRESENT | (guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT);
+#elif GUEST_PAGING_LEVELS >= 4
+    accumulated_flags &= guest_l3e_get_flags(*gw->l3e) ^ _PAGE_NX_BIT;
+    accumulated_flags &= guest_l4e_get_flags(*gw->l4e) ^ _PAGE_NX_BIT;
+#endif
+
+    // Finally, revert the NX bit back to its original polarity
+    accumulated_flags ^= _PAGE_NX_BIT;
+
+    return accumulated_flags;
+}
+
+#endif /* _XEN_SHADOW2_TYPES_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/shadow2.h b/xen/include/asm-x86/shadow2.h
new file mode 100644 (file)
index 0000000..94de778
--- /dev/null
@@ -0,0 +1,627 @@
+/******************************************************************************
+ * include/asm-x86/shadow2.h
+ * 
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _XEN_SHADOW2_H
+#define _XEN_SHADOW2_H
+
+#include <public/dom0_ops.h> 
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <asm/flushtlb.h>
+
+/* Shadow PT operation mode : shadow-mode variable in arch_domain. */
+
+#define SHM2_shift 10
+/* We're in one of the shadow modes */
+#define SHM2_enable    (DOM0_SHADOW2_CONTROL_FLAG_ENABLE << SHM2_shift)
+/* Refcounts based on shadow tables instead of guest tables */
+#define SHM2_refcounts (DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT << SHM2_shift)
+/* Enable log dirty mode */
+#define SHM2_log_dirty (DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY << SHM2_shift)
+/* Xen does p2m translation, not guest */
+#define SHM2_translate (DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE << SHM2_shift)
+/* Xen does not steal address space from the domain for its own bookkeeping;
+ * requires VT or similar mechanisms */
+#define SHM2_external  (DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL << SHM2_shift)
+
+#define shadow2_mode_enabled(_d)   ((_d)->arch.shadow2_mode)
+#define shadow2_mode_refcounts(_d) ((_d)->arch.shadow2_mode & SHM2_refcounts)
+#define shadow2_mode_log_dirty(_d) ((_d)->arch.shadow2_mode & SHM2_log_dirty)
+#define shadow2_mode_translate(_d) ((_d)->arch.shadow2_mode & SHM2_translate)
+#define shadow2_mode_external(_d)  ((_d)->arch.shadow2_mode & SHM2_external)
+
+/* Xen traps & emulates all reads of all page table pages:
+ *not yet supported
+ */
+#define shadow2_mode_trap_reads(_d) ({ (void)(_d); 0; })
+
+// flags used in the return value of the shadow_set_lXe() functions...
+#define SHADOW2_SET_CHANGED            0x1
+#define SHADOW2_SET_FLUSH              0x2
+#define SHADOW2_SET_ERROR              0x4
+#define SHADOW2_SET_L3PAE_RECOPY       0x8
+
+// How do we tell that we have a 32-bit PV guest in a 64-bit Xen?
+#ifdef __x86_64__
+#define pv_32bit_guest(_v) 0 // not yet supported
+#else
+#define pv_32bit_guest(_v) (!hvm_guest(_v)) /* use the macro arg, parenthesized */
+#endif
+
+/* The shadow2 lock.
+ *
+ * This lock is per-domain.  It is intended to allow us to make atomic
+ * updates to the software TLB that the shadow tables provide.
+ * 
+ * Specifically, it protects:
+ *   - all changes to shadow page table pages
+ *   - the shadow hash table
+ *   - the shadow page allocator 
+ *   - all changes to guest page table pages; if/when the notion of
+ *     out-of-sync pages is added to this code, then the shadow lock is
+ *     protecting all guest page table pages which are not listed as
+ *     currently as both guest-writable and out-of-sync...
+ *     XXX -- need to think about this relative to writable page tables.
+ *   - all changes to the page_info->tlbflush_timestamp
+ *   - the page_info->count fields on shadow pages
+ *   - the shadow dirty bit array and count
+ *   - XXX
+ */
+#ifndef CONFIG_SMP
+#error shadow2.h currently requires CONFIG_SMP
+#endif
+
+#define shadow2_lock_init(_d)                                   \
+    do {                                                        \
+        spin_lock_init(&(_d)->arch.shadow2_lock);               \
+        (_d)->arch.shadow2_locker = -1;                         \
+        (_d)->arch.shadow2_locker_function = "nobody";          \
+    } while (0)
+
+#define shadow2_lock_is_acquired(_d)                            \
+    (current->processor == (_d)->arch.shadow2_locker)
+
+#define shadow2_lock(_d)                                                 \
+    do {                                                                 \
+        if ( unlikely((_d)->arch.shadow2_locker == current->processor) ) \
+        {                                                                \
+            printk("Error: shadow2 lock held by %s\n",                   \
+                   (_d)->arch.shadow2_locker_function);                  \
+            BUG();                                                       \
+        }                                                                \
+        spin_lock(&(_d)->arch.shadow2_lock);                             \
+        ASSERT((_d)->arch.shadow2_locker == -1);                         \
+        (_d)->arch.shadow2_locker = current->processor;                  \
+        (_d)->arch.shadow2_locker_function = __func__;                   \
+    } while (0)
+
+#define shadow2_unlock(_d)                                              \
+    do {                                                                \
+        ASSERT((_d)->arch.shadow2_locker == current->processor);        \
+        (_d)->arch.shadow2_locker = -1;                                 \
+        (_d)->arch.shadow2_locker_function = "nobody";                  \
+        spin_unlock(&(_d)->arch.shadow2_lock);                          \
+    } while (0)
+
+/* 
+ * Levels of self-test and paranoia
+ * XXX should go in config files somewhere?  
+ */
+#define SHADOW2_AUDIT_HASH           0x01  /* Check current hash bucket */
+#define SHADOW2_AUDIT_HASH_FULL      0x02  /* Check every hash bucket */
+#define SHADOW2_AUDIT_ENTRIES        0x04  /* Check this walk's shadows */
+#define SHADOW2_AUDIT_ENTRIES_FULL   0x08  /* Check every shadow */
+#define SHADOW2_AUDIT_ENTRIES_MFNS   0x10  /* Check gfn-mfn map in shadows */
+#define SHADOW2_AUDIT_P2M            0x20  /* Check the p2m table */
+
+#ifdef NDEBUG
+#define SHADOW2_AUDIT                   0
+#define SHADOW2_AUDIT_ENABLE            0
+#else
+#define SHADOW2_AUDIT                0x15  /* Basic audit of all except p2m. */
+#define SHADOW2_AUDIT_ENABLE         shadow2_audit_enable
+extern int shadow2_audit_enable;
+#endif
+
+/* 
+ * Levels of optimization
+ * XXX should go in config files somewhere?  
+ */
+#define SH2OPT_WRITABLE_HEURISTIC  0x01  /* Guess at RW PTEs via linear maps */
+#define SH2OPT_EARLY_UNSHADOW      0x02  /* Unshadow l1s on fork or exit */
+
+#define SHADOW2_OPTIMIZATIONS      0x03
+
+
+/* With shadow pagetables, the different kinds of address start 
+ * to get confusing.
+ * 
+ * Virtual addresses are what they usually are: the addresses that are used 
+ * to access memory while the guest is running.  The MMU translates from 
+ * virtual addresses to machine addresses. 
+ * 
+ * (Pseudo-)physical addresses are the abstraction of physical memory the
+ * guest uses for allocation and so forth.  For the purposes of this code, 
+ * we can largely ignore them.
+ *
+ * Guest frame numbers (gfns) are the entries that the guest puts in its
+ * pagetables.  For normal paravirtual guests, they are actual frame numbers,
+ * with the translation done by the guest.  
+ * 
+ * Machine frame numbers (mfns) are the entries that the hypervisor puts
+ * in the shadow page tables.
+ *
+ * Elsewhere in the xen code base, the name "gmfn" is generally used to refer
+ * to a "machine frame number, from the guest's perspective", or in other
+ * words, pseudo-physical frame numbers.  However, in the shadow code, the
+ * term "gmfn" means "the mfn of a guest page"; this combines naturally with
+ * other terms such as "smfn" (the mfn of a shadow page), gl2mfn (the mfn of a
+ * guest L2 page), etc...
+ */
+
+/* With this defined, we do some ugly things to force the compiler to
+ * give us type safety between mfns and gfns and other integers.
+ * TYPE_SAFE(int foo) defines a foo_t, and _foo() and foo_x() functions 
+ * that translate between int and foo_t.
+ * 
+ * It does have some performance cost because the types now have 
+ * a different storage attribute, so may not want it on all the time. */
+#ifndef NDEBUG
+#define TYPE_SAFETY 1
+#endif
+
+#ifdef TYPE_SAFETY
+#define TYPE_SAFE(_type,_name)                                  \
+typedef struct { _type _name; } _name##_t;                      \
+static inline _name##_t _##_name(_type n) { return (_name##_t) { n }; } \
+static inline _type _name##_x(_name##_t n) { return n._name; }
+#else
+#define TYPE_SAFE(_type,_name)                                          \
+typedef _type _name##_t;                                                \
+static inline _name##_t _##_name(_type n) { return n; }                 \
+static inline _type _name##_x(_name##_t n) { return n; }
+#endif
+
+TYPE_SAFE(unsigned long,mfn)
+#define SH2_PRI_mfn "05lx"
+
+static inline int
+valid_mfn(mfn_t m)
+{
+    return VALID_MFN(mfn_x(m));
+}
+
+static inline mfn_t
+pagetable_get_mfn(pagetable_t pt)
+{
+    return _mfn(pagetable_get_pfn(pt));
+}
+
+static inline pagetable_t
+pagetable_from_mfn(mfn_t mfn)
+{
+    return pagetable_from_pfn(mfn_x(mfn));
+}
+
+static inline int
+shadow2_vcpu_mode_translate(struct vcpu *v)
+{
+    // Returns true if this VCPU needs to be using the P2M table to translate
+    // between GFNs and MFNs.
+    //
+    // This is true of translated HVM domains on a vcpu which has paging
+    // enabled.  (HVM vcpu's with paging disabled are using the p2m table as
+    // its paging table, so no translation occurs in this case.)
+    //
+    return v->vcpu_flags & VCPUF_shadow2_translate;
+}
+
+
+/**************************************************************************/
+/* Mode-specific entry points into the shadow code */
+
+struct x86_emulate_ctxt;
+struct shadow2_entry_points {
+    int           (*page_fault            )(struct vcpu *v, unsigned long va,
+                                            struct cpu_user_regs *regs);
+    int           (*invlpg                )(struct vcpu *v, unsigned long va);
+    unsigned long (*gva_to_gpa            )(struct vcpu *v, unsigned long va);
+    unsigned long (*gva_to_gfn            )(struct vcpu *v, unsigned long va);
+    void          (*update_cr3            )(struct vcpu *v);
+    int           (*map_and_validate_gl1e )(struct vcpu *v, mfn_t gmfn,
+                                            void *new_guest_entry, u32 size);
+    int           (*map_and_validate_gl2e )(struct vcpu *v, mfn_t gmfn,
+                                            void *new_guest_entry, u32 size);
+    int           (*map_and_validate_gl2he)(struct vcpu *v, mfn_t gmfn,
+                                            void *new_guest_entry, u32 size);
+    int           (*map_and_validate_gl3e )(struct vcpu *v, mfn_t gmfn,
+                                            void *new_guest_entry, u32 size);
+    int           (*map_and_validate_gl4e )(struct vcpu *v, mfn_t gmfn,
+                                            void *new_guest_entry, u32 size);
+    void          (*detach_old_tables     )(struct vcpu *v);
+    int           (*x86_emulate_write     )(struct vcpu *v, unsigned long va,
+                                            void *src, u32 bytes,
+                                            struct x86_emulate_ctxt *ctxt);
+    int           (*x86_emulate_cmpxchg   )(struct vcpu *v, unsigned long va,
+                                            unsigned long old, 
+                                            unsigned long new,
+                                            unsigned int bytes,
+                                            struct x86_emulate_ctxt *ctxt);
+    int           (*x86_emulate_cmpxchg8b )(struct vcpu *v, unsigned long va,
+                                            unsigned long old_lo, 
+                                            unsigned long old_hi, 
+                                            unsigned long new_lo,
+                                            unsigned long new_hi,
+                                            struct x86_emulate_ctxt *ctxt);
+    mfn_t         (*make_monitor_table    )(struct vcpu *v);
+    void          (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn);
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
+    int           (*guess_wrmap           )(struct vcpu *v, 
+                                            unsigned long vaddr, mfn_t gmfn);
+#endif
+    /* For outsiders to tell what mode we're in */
+    unsigned int shadow_levels;
+    unsigned int guest_levels;
+};
+
+static inline int shadow2_guest_paging_levels(struct vcpu *v)
+{
+    ASSERT(v->arch.shadow2 != NULL);
+    return v->arch.shadow2->guest_levels;
+}
+
+/**************************************************************************/
+/* Entry points into the shadow code */
+
+/* Turning on shadow2 test mode */
+int shadow2_test_enable(struct domain *d);
+
+/* Handler for shadow control ops: enabling and disabling shadow modes, 
+ * and log-dirty bitmap ops all happen through here. */
+int shadow2_control_op(struct domain *d, 
+                       dom0_shadow_control_t *sc,
+                       XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op);
+
+/* Call when destroying a domain */
+void shadow2_teardown(struct domain *d);
+
+/* Call once all of the references to the domain have gone away */
+void shadow2_final_teardown(struct domain *d);
+
+
+/* Mark a page as dirty in the bitmap */
+void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn);
+static inline void mark_dirty(struct domain *d, unsigned long gmfn)
+{
+    if ( shadow2_mode_log_dirty(d) )
+    {
+        shadow2_lock(d);
+        sh2_do_mark_dirty(d, _mfn(gmfn));
+        shadow2_unlock(d);
+    }
+}
+
+/* Internal version, for when the shadow lock is already held */
+static inline void sh2_mark_dirty(struct domain *d, mfn_t gmfn)
+{
+    ASSERT(shadow2_lock_is_acquired(d));
+    if ( shadow2_mode_log_dirty(d) )
+        sh2_do_mark_dirty(d, gmfn);
+}
+
+static inline int
+shadow2_fault(unsigned long va, struct cpu_user_regs *regs)
+/* Called from pagefault handler in Xen, and from the HVM trap handlers
+ * for pagefaults.  Returns 1 if this fault was an artefact of the
+ * shadow code (and the guest should retry) or 0 if it is not (and the
+ * fault should be handled elsewhere or passed to the guest). */
+{
+    struct vcpu *v = current;
+    perfc_incrc(shadow2_fault);
+    return v->arch.shadow2->page_fault(v, va, regs);
+}
+
+static inline int
+shadow2_invlpg(struct vcpu *v, unsigned long va)
+/* Called when the guest requests an invlpg.  Returns 1 if the invlpg
+ * instruction should be issued on the hardware, or 0 if it's safe not
+ * to do so. */
+{
+    return v->arch.shadow2->invlpg(v, va);
+}
+
+static inline unsigned long
+shadow2_gva_to_gpa(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to what the *guest*
+ * pagetables would map it to. */
+{
+    return v->arch.shadow2->gva_to_gpa(v, va);
+}
+
+static inline unsigned long
+shadow2_gva_to_gfn(struct vcpu *v, unsigned long va)
+/* Called to translate a guest virtual address to what the *guest*
+ * pagetables would map it to. */
+{
+    return v->arch.shadow2->gva_to_gfn(v, va);
+}
+
+static inline void
+shadow2_update_cr3(struct vcpu *v)
+/* Updates all the things that are derived from the guest's CR3. 
+ * Called when the guest changes CR3. */
+{
+    shadow2_lock(v->domain);
+    v->arch.shadow2->update_cr3(v);
+    shadow2_unlock(v->domain);
+}
+
+
+/* Should be called after CR3 is updated.
+ * Updates vcpu->arch.cr3 and, for HVM guests, vcpu->arch.hvm_vcpu.cpu_cr3.
+ * 
+ * Also updates other state derived from CR3 (vcpu->arch.guest_vtable,
+ * shadow_vtable, etc).
+ *
+ * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
+ * for HVM guests, arch.monitor_table and hvm's guest CR3.
+ *
+ * Update ref counts to shadow tables appropriately.
+ * For PAE, relocate L3 entries, if necessary, into low memory.
+ */
+static inline void update_cr3(struct vcpu *v)
+{
+    unsigned long cr3_mfn=0;
+
+    if ( shadow2_mode_enabled(v->domain) )
+    {
+        shadow2_update_cr3(v);
+        return;
+    }
+
+#if CONFIG_PAGING_LEVELS == 4
+    if ( !(v->arch.flags & TF_kernel_mode) )
+        cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user);
+    else
+#endif
+        cr3_mfn = pagetable_get_pfn(v->arch.guest_table);
+
+    /* Update vcpu->arch.cr3 */
+    BUG_ON(cr3_mfn == 0);
+    make_cr3(v, cr3_mfn);
+}
+
+extern void sh2_update_paging_modes(struct vcpu *v);
+
+/* Should be called to initialise paging structures if the paging mode
+ * has changed, and when bringing up a VCPU for the first time. */
+static inline void shadow2_update_paging_modes(struct vcpu *v)
+{
+    ASSERT(shadow2_mode_enabled(v->domain));
+    shadow2_lock(v->domain);
+    sh2_update_paging_modes(v);
+    shadow2_unlock(v->domain);
+}
+
+static inline void
+shadow2_detach_old_tables(struct vcpu *v)
+{
+    v->arch.shadow2->detach_old_tables(v);
+}
+
+static inline mfn_t
+shadow2_make_monitor_table(struct vcpu *v)
+{
+    return v->arch.shadow2->make_monitor_table(v);
+}
+
+static inline void
+shadow2_destroy_monitor_table(struct vcpu *v, mfn_t mmfn)
+{
+    v->arch.shadow2->destroy_monitor_table(v, mmfn);
+}
+
+/* Validate a pagetable change from the guest and update the shadows. */
+extern int shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn,
+                                        void *new_guest_entry);
+
+/* Update the shadows in response to a pagetable write from a HVM guest */
+extern void shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, 
+                                            void *entry, u32 size);
+
+/* Remove all writeable mappings of a guest frame from the shadows.
+ * Returns non-zero if we need to flush TLBs. 
+ * level and fault_addr describe how we found this to be a pagetable;
+ * level==0 means we have some other reason for revoking write access. */
+extern int shadow2_remove_write_access(struct vcpu *v, mfn_t readonly_mfn,
+                                       unsigned int level,
+                                       unsigned long fault_addr);
+
+/* Remove all mappings of the guest mfn from the shadows. 
+ * Returns non-zero if we need to flush TLBs. */
+extern int shadow2_remove_all_mappings(struct vcpu *v, mfn_t target_mfn);
+
+void
+shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn);
+/* This is a HVM page that we think is no longer a pagetable.
+ * Unshadow it, and recursively unshadow pages that reference it. */
+
+/* Remove all shadows of the guest mfn. */
+extern void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all);
+static inline void shadow2_remove_all_shadows(struct vcpu *v, mfn_t gmfn)
+{
+    sh2_remove_shadows(v, gmfn, 1);
+}
+
+/* Add a page to a domain */
+void
+shadow2_guest_physmap_add_page(struct domain *d, unsigned long gfn,
+                               unsigned long mfn);
+
+/* Remove a page from a domain */
+void
+shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
+                                  unsigned long mfn);
+
+/*
+ * Definitions for the shadow2_flags field in page_info.
+ * These flags are stored on *guest* pages...
+ * Bits 1-13 are encodings for the shadow types.
+ */
+#define PGC_SH2_type_to_index(_type) ((_type) >> PGC_SH2_type_shift)
+#define SH2F_page_type_mask \
+    (((1u << PGC_SH2_type_to_index(PGC_SH2_max_shadow + 1u)) - 1u) - \
+     ((1u << PGC_SH2_type_to_index(PGC_SH2_min_shadow)) - 1u))
+
+#define SH2F_L1_32   (1u << PGC_SH2_type_to_index(PGC_SH2_l1_32_shadow))
+#define SH2F_FL1_32  (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_32_shadow))
+#define SH2F_L2_32   (1u << PGC_SH2_type_to_index(PGC_SH2_l2_32_shadow))
+#define SH2F_L1_PAE  (1u << PGC_SH2_type_to_index(PGC_SH2_l1_pae_shadow))
+#define SH2F_FL1_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_pae_shadow))
+#define SH2F_L2_PAE  (1u << PGC_SH2_type_to_index(PGC_SH2_l2_pae_shadow))
+#define SH2F_L2H_PAE (1u << PGC_SH2_type_to_index(PGC_SH2_l2h_pae_shadow))
+#define SH2F_L3_PAE  (1u << PGC_SH2_type_to_index(PGC_SH2_l3_pae_shadow))
+#define SH2F_L1_64   (1u << PGC_SH2_type_to_index(PGC_SH2_l1_64_shadow))
+#define SH2F_FL1_64  (1u << PGC_SH2_type_to_index(PGC_SH2_fl1_64_shadow))
+#define SH2F_L2_64   (1u << PGC_SH2_type_to_index(PGC_SH2_l2_64_shadow))
+#define SH2F_L3_64   (1u << PGC_SH2_type_to_index(PGC_SH2_l3_64_shadow))
+#define SH2F_L4_64   (1u << PGC_SH2_type_to_index(PGC_SH2_l4_64_shadow))
+
+/* Used for hysteresis when automatically unhooking mappings on fork/exit */
+#define SH2F_unhooked_mappings (1u<<31)
+
+/* 
+ * Allocation of shadow pages 
+ */
+
+/* Return the minimum acceptable number of shadow pages a domain needs */
+unsigned int shadow2_min_acceptable_pages(struct domain *d);
+
+/* Set the pool of shadow pages to the required number of MB.
+ * Input will be rounded up to at least min_acceptable_shadow_pages().
+ * Returns 0 for success, 1 for failure. */
+unsigned int shadow2_set_allocation(struct domain *d, 
+                                    unsigned int megabytes,
+                                    int *preempted);
+
+/* Return the size of the shadow2 pool, rounded up to the nearest MB */
+static inline unsigned int shadow2_get_allocation(struct domain *d)
+{
+    unsigned int pg = d->arch.shadow2_total_pages;
+    return ((pg >> (20 - PAGE_SHIFT))
+            + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
+}
+
+/*
+ * Linked list for chaining entries in the shadow hash table. 
+ */
+struct shadow2_hash_entry {
+    struct shadow2_hash_entry *next;
+    mfn_t smfn;                 /* MFN of the shadow */
+#ifdef _x86_64_ /* Shorten 'n' so we don't waste a whole word on storing 't' */
+    unsigned long n:56;         /* MFN of guest PT or GFN of guest superpage */
+#else
+    unsigned long n;            /* MFN of guest PT or GFN of guest superpage */
+#endif
+    unsigned char t;            /* shadow type bits, or 0 for empty */
+};
+
+#define SHADOW2_HASH_BUCKETS 251
+/* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */
+
+
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_CACHE_WALKS
+/* Optimization: cache the results of guest walks.  This helps with MMIO
+ * and emulated writes, which tend to issue very similar walk requests
+ * repeatedly.  We keep the results of the last few walks, and blow
+ * away the cache on guest cr3 write, mode change, or page fault. */
+
+#define SH2_WALK_CACHE_ENTRIES 4
+
+/* Rather than cache a guest walk, which would include mapped pointers 
+ * to pages, we cache what a TLB would remember about the walk: the 
+ * permissions and the l1 gfn */
+struct shadow2_walk_cache {
+    unsigned long va;           /* The virtual address (or 0 == unused) */
+    unsigned long gfn;          /* The gfn from the effective l1e   */
+    u32 permissions;            /* The aggregated permission bits   */
+};
+#endif
+
+
+/**************************************************************************/
+/* Guest physmap (p2m) support */
+
+/* Walk another domain's P2M table, mapping pages as we go */
+extern mfn_t
+sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn);
+
+
+/* General conversion function from gfn to mfn */
+static inline mfn_t
+sh2_gfn_to_mfn(struct domain *d, unsigned long gfn)
+{
+    if ( !shadow2_mode_translate(d) )
+        return _mfn(gfn);
+    else if ( likely(current->domain == d) )
+        return _mfn(get_mfn_from_gpfn(gfn));
+    else
+        return sh2_gfn_to_mfn_foreign(d, gfn);
+}
+
+// vcpu-specific version of gfn_to_mfn().  This is where we hide the dirty
+// little secret that, for hvm guests with paging disabled, nearly all of the
+// shadow code actually thinks that the guest is running on *untranslated* page
+// tables (which is actually domain->phys_table).
+//
+static inline mfn_t
+sh2_vcpu_gfn_to_mfn(struct vcpu *v, unsigned long gfn)
+{ 
+    if ( !shadow2_vcpu_mode_translate(v) )
+        return _mfn(gfn);
+    if ( likely(current->domain == v->domain) )
+        return _mfn(get_mfn_from_gpfn(gfn));
+    return sh2_gfn_to_mfn_foreign(v->domain, gfn);
+}
+
+static inline unsigned long
+sh2_mfn_to_gfn(struct domain *d, mfn_t mfn)
+{
+    if ( shadow2_mode_translate(d) )
+        return get_gpfn_from_mfn(mfn_x(mfn));
+    else
+        return mfn_x(mfn);
+}
+
+
+
+#endif /* _XEN_SHADOW2_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+      
diff --git a/xen/include/asm-x86/shadow_64.h b/xen/include/asm-x86/shadow_64.h
deleted file mode 100644 (file)
index d9afbdc..0000000
+++ /dev/null
@@ -1,587 +0,0 @@
-/******************************************************************************
- * include/asm-x86/shadow_64.h
- * 
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
- * 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-/*
- * Jun Nakajima <jun.nakajima@intel.com>
- * Chengyuan Li <chengyuan.li@intel.com>
- *
- * Extended to support 64-bit guests.
- */
-#ifndef _XEN_SHADOW_64_H
-#define _XEN_SHADOW_64_H
-#include <asm/shadow.h>
-#include <asm/shadow_ops.h>
-#include <asm/hvm/hvm.h>
-
-/*
- * The naming convention of the shadow_ops:
- * MODE_<pgentry size>_<guest paging levels>_HANDLER
- */
-extern struct shadow_ops MODE_64_2_HANDLER;
-extern struct shadow_ops MODE_64_3_HANDLER;
-extern struct shadow_ops MODE_64_PAE_HANDLER;
-#if CONFIG_PAGING_LEVELS == 4
-extern struct shadow_ops MODE_64_4_HANDLER;
-#endif
-
-#if CONFIG_PAGING_LEVELS == 3
-#define L4_PAGETABLE_SHIFT      39
-#define L4_PAGETABLE_ENTRIES    (1<<PAGETABLE_ORDER)
-typedef struct { intpte_t l4; } l4_pgentry_t;
-#define is_guest_l4_slot(_s) (1)
-#endif
-
-#define READ_FAULT  0
-#define WRITE_FAULT 1
-
-#define ERROR_P     1
-#define ERROR_W     2
-#define ERROR_U     4
-#define ERROR_I     (1 << 4)
-
-#define X86_64_SHADOW_DEBUG 0
-
-#if X86_64_SHADOW_DEBUG
-#define ESH_LOG(_f, _a...)              \
-        printk(_f, ##_a)
-#else
-#define ESH_LOG(_f, _a...) ((void)0)
-#endif
-
-#define L_MASK  0xff
-
-#define PAE_PAGING_LEVELS   3
-
-#define ROOT_LEVEL_64   PAGING_L4
-#define ROOT_LEVEL_32   PAGING_L2
-
-#define DIRECT_ENTRY    (4UL << 16)
-#define SHADOW_ENTRY    (2UL << 16)
-#define GUEST_ENTRY     (1UL << 16)
-
-#define GET_ENTRY   (2UL << 8)
-#define SET_ENTRY   (1UL << 8)
-
-#define PAGETABLE_ENTRIES    (1<<PAGETABLE_ORDER)
-
-/* For 32-bit VMX guest to allocate shadow L1 & L2*/
-#define SL1_ORDER   1
-#define SL2_ORDER   2
-
-typedef struct { intpte_t lo; } pgentry_64_t;
-#define shadow_level_to_type(l)    (l << 29)
-#define shadow_type_to_level(t)    (t >> 29)
-
-#define entry_get_value(_x)         ((_x).lo)
-#define entry_get_pfn(_x)           \
-      (((_x).lo & (PADDR_MASK&PAGE_MASK)) >> PAGE_SHIFT)
-#define entry_get_paddr(_x)          (((_x).lo & (PADDR_MASK&PAGE_MASK)))
-#define entry_get_flags(_x)         (get_pte_flags((_x).lo))
-
-#define entry_empty()           ((pgentry_64_t) { 0 })
-#define entry_from_pfn(pfn, flags)  \
-    ((pgentry_64_t) { ((intpte_t)(pfn) << PAGE_SHIFT) | put_pte_flags(flags) })
-#define entry_from_page(page, flags) (entry_from_pfn(page_to_mfn(page),(flags)))
-#define entry_add_flags(x, flags)    ((x).lo |= put_pte_flags(flags))
-#define entry_remove_flags(x, flags) ((x).lo &= ~put_pte_flags(flags))
-#define entry_has_changed(x,y,flags) \
-        ( !!(((x).lo ^ (y).lo) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags(flags))) )
-
-/******************************************************************************/
-/*
- * The macro and inlines are for 32-bit PAE guest 
- */
-#define PAE_PDPT_RESERVED   0x1e6 /* [8:5], [2,1] */
-
-#define PAE_SHADOW_SELF_ENTRY   259
-#define PAE_L3_PAGETABLE_ENTRIES   4
-
-/******************************************************************************/
-static inline int  table_offset_64(unsigned long va, int level)
-{
-    switch(level) {
-        case 1:
-            return  (((va) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1));
-        case 2:
-            return  (((va) >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES - 1));
-        case 3:
-            return  (((va) >> L3_PAGETABLE_SHIFT) & (L3_PAGETABLE_ENTRIES - 1));
-#if CONFIG_PAGING_LEVELS == 3
-        case 4:
-            return PAE_SHADOW_SELF_ENTRY;
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
-#ifndef GUEST_PGENTRY_32
-#ifndef GUEST_32PAE
-        case 4:
-            return  (((va) >> L4_PAGETABLE_SHIFT) & (L4_PAGETABLE_ENTRIES - 1));
-#else
-        case 4:
-            return PAE_SHADOW_SELF_ENTRY;
-#endif
-#else
-        case 4:
-            return PAE_SHADOW_SELF_ENTRY;
-#endif
-#endif
-        default:
-            return -1;
-    }
-}
-
-/*****************************************************************************/
-
-#if defined( GUEST_32PAE )
-static inline int guest_table_offset_64(unsigned long va, int level, unsigned int index)
-{
-    switch(level) {
-        case 1:
-            return  (((va) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1));
-        case 2:
-            return  (((va) >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES - 1));
-        case 3:
-            return  (index * 4 + ((va) >> L3_PAGETABLE_SHIFT));
-#if CONFIG_PAGING_LEVELS == 3
-        case 4:
-            return PAE_SHADOW_SELF_ENTRY;
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
-#ifndef GUEST_PGENTRY_32
-        case 4:
-            return  (((va) >> L4_PAGETABLE_SHIFT) & (L4_PAGETABLE_ENTRIES - 1));
-#else
-        case 4:
-            return PAE_SHADOW_SELF_ENTRY;
-#endif
-#endif
-        default:
-            return -1;
-    }
-}
-
-#define SH_GUEST_32PAE 1
-#else 
-#define guest_table_offset_64(va, level, index) \
-            table_offset_64((va),(level))
-#define SH_GUEST_32PAE 0
-#endif
-
-/********************************************************************************/
-
-static inline void free_out_of_sync_state(struct domain *d)
-{
-    struct out_of_sync_entry *entry;
-
-    // NB: Be careful not to call something that manipulates this list
-    //     while walking it.  Remove one item at a time, and always
-    //     restart from start of list.
-    //
-    while ( (entry = d->arch.out_of_sync) )
-    {
-        d->arch.out_of_sync = entry->next;
-        release_out_of_sync_entry(d, entry);
-
-        entry->next = d->arch.out_of_sync_free;
-        d->arch.out_of_sync_free = entry;
-    }
-}
-
-static inline int __entry(
-    struct vcpu *v, unsigned long va, pgentry_64_t *e_p, u32 flag)
-{
-    int i;
-    pgentry_64_t *le_e;
-    pgentry_64_t *le_p = NULL;
-    pgentry_64_t *phys_vtable = NULL;
-    unsigned long mfn;
-    int index;
-    u32 level = flag & L_MASK;
-    struct domain *d = v->domain;
-    int root_level;
-    unsigned int base_idx;
-
-    base_idx = get_cr3_idxval(v);
-
-    if ( flag & SHADOW_ENTRY )
-    {
-        root_level =  ROOT_LEVEL_64;
-        index = table_offset_64(va, root_level);
-        le_e = (pgentry_64_t *)&v->arch.shadow_vtable[index];
-    }
-    else if ( flag & GUEST_ENTRY )
-    {
-        root_level = v->domain->arch.ops->guest_paging_levels;
-        if ( root_level == PAGING_L3 )
-            index = guest_table_offset_64(va, PAGING_L3, base_idx);
-        else
-            index = guest_table_offset_64(va, root_level, base_idx);
-        le_e = (pgentry_64_t *)&v->arch.guest_vtable[index];
-    }
-    else /* direct mode */
-    {
-        root_level = PAE_PAGING_LEVELS;
-        index = table_offset_64(va, root_level);
-        phys_vtable = (pgentry_64_t *)map_domain_page(
-            pagetable_get_pfn(v->domain->arch.phys_table));
-        le_e = &phys_vtable[index];
-    }
-
-    /*
-     * If it's not external mode, then mfn should be machine physical.
-     */
-    for ( i = root_level - level; i > 0; i-- )
-    {
-        if ( unlikely(!(entry_get_flags(*le_e) & _PAGE_PRESENT)) )
-        {
-            if ( le_p )
-                unmap_domain_page(le_p);
-
-            if ( phys_vtable )
-                unmap_domain_page(phys_vtable);
-
-            return 0;
-        }
-
-        mfn = entry_get_pfn(*le_e);
-        if ( (flag & GUEST_ENTRY) && shadow_mode_translate(d) )
-            mfn = get_mfn_from_gpfn(mfn);
-
-        if ( le_p )
-            unmap_domain_page(le_p);
-        le_p = (pgentry_64_t *)map_domain_page(mfn);
-
-        if ( flag & SHADOW_ENTRY )
-            index = table_offset_64(va, (level + i - 1));
-        else
-            index = guest_table_offset_64(va, (level + i - 1), base_idx);
-        le_e = &le_p[index];
-    }
-
-    if ( flag & SET_ENTRY )
-        *le_e = *e_p;
-    else
-        *e_p = *le_e;
-
-    if ( le_p )
-        unmap_domain_page(le_p);
-
-    if ( phys_vtable )
-        unmap_domain_page(phys_vtable);
-
-    return 1;
-}
-
-static inline int __rw_entry(
-    struct vcpu *v, unsigned long va, void *e_p, u32 flag)
-{
-    pgentry_64_t *e = (pgentry_64_t *)e_p;
-
-    if (e) {
-        return __entry(v, va, e, flag);
-    }
-
-    return 0;
-}
-
-#define __shadow_set_l4e(v, va, value) \
-  __rw_entry(v, va, value, SHADOW_ENTRY | SET_ENTRY | PAGING_L4)
-#define __shadow_get_l4e(v, va, sl4e) \
-  __rw_entry(v, va, sl4e, SHADOW_ENTRY | GET_ENTRY | PAGING_L4)
-#define __shadow_set_l3e(v, va, value) \
-  __rw_entry(v, va, value, SHADOW_ENTRY | SET_ENTRY | PAGING_L3)
-#define __shadow_get_l3e(v, va, sl3e) \
-  __rw_entry(v, va, sl3e, SHADOW_ENTRY | GET_ENTRY | PAGING_L3)
-#define __shadow_set_l2e(v, va, value) \
-  __rw_entry(v, va, value, SHADOW_ENTRY | SET_ENTRY | PAGING_L2)
-#define __shadow_get_l2e(v, va, sl2e) \
-  __rw_entry(v, va, sl2e, SHADOW_ENTRY | GET_ENTRY | PAGING_L2)
-#define __shadow_set_l1e(v, va, value) \
-  __rw_entry(v, va, value, SHADOW_ENTRY | SET_ENTRY | PAGING_L1)
-#define __shadow_get_l1e(v, va, sl1e) \
-  __rw_entry(v, va, sl1e, SHADOW_ENTRY | GET_ENTRY | PAGING_L1)
-
-#define __guest_set_l4e(v, va, value) \
-  __rw_entry(v, va, value, GUEST_ENTRY | SET_ENTRY | PAGING_L4)
-#define __guest_get_l4e(v, va, gl4e) \
-  __rw_entry(v, va, gl4e, GUEST_ENTRY | GET_ENTRY | PAGING_L4)
-#define __guest_set_l3e(v, va, value) \
-  __rw_entry(v, va, value, GUEST_ENTRY | SET_ENTRY | PAGING_L3)
-#define __guest_get_l3e(v, va, sl3e) \
-  __rw_entry(v, va, gl3e, GUEST_ENTRY | GET_ENTRY | PAGING_L3)
-
-#define __direct_set_l3e(v, va, value) \
-  __rw_entry(v, va, value, DIRECT_ENTRY | SET_ENTRY | PAGING_L3)
-#define __direct_get_l3e(v, va, sl3e) \
-  __rw_entry(v, va, sl3e, DIRECT_ENTRY | GET_ENTRY | PAGING_L3)
-#define __direct_set_l2e(v, va, value) \
-  __rw_entry(v, va, value, DIRECT_ENTRY | SET_ENTRY | PAGING_L2)
-#define __direct_get_l2e(v, va, sl2e) \
-  __rw_entry(v, va, sl2e, DIRECT_ENTRY | GET_ENTRY | PAGING_L2)
-#define __direct_set_l1e(v, va, value) \
-  __rw_entry(v, va, value, DIRECT_ENTRY | SET_ENTRY | PAGING_L1)
-#define __direct_get_l1e(v, va, sl1e) \
-  __rw_entry(v, va, sl1e, DIRECT_ENTRY | GET_ENTRY | PAGING_L1)
-
-
-static inline int  __guest_set_l2e(
-    struct vcpu *v, unsigned long va, void *value, int size)
-{
-    switch(size) {
-        case 4:
-            // 32-bit guest
-            {
-                l2_pgentry_32_t *l2va;
-
-                l2va = (l2_pgentry_32_t *)v->arch.guest_vtable;
-                if (value)
-                    l2va[l2_table_offset_32(va)] = *(l2_pgentry_32_t *)value;
-                return 1;
-            }
-        case 8:
-            return __rw_entry(v, va, value, GUEST_ENTRY | SET_ENTRY | PAGING_L2);
-        default:
-            BUG();
-            return 0;
-    }
-    return 0;
-}
-
-#define __guest_set_l2e(v, va, value) \
-    __guest_set_l2e(v, (unsigned long)va, value, sizeof(*value))
-
-static inline int  __guest_get_l2e(
-  struct vcpu *v, unsigned long va, void *gl2e, int size)
-{
-    switch(size) {
-        case 4:
-            // 32-bit guest
-            {
-                l2_pgentry_32_t *l2va;
-                l2va = (l2_pgentry_32_t *)v->arch.guest_vtable;
-                if (gl2e)
-                    *(l2_pgentry_32_t *)gl2e = l2va[l2_table_offset_32(va)];
-                return 1;
-            }
-        case 8:
-            return __rw_entry(v, va, gl2e, GUEST_ENTRY | GET_ENTRY | PAGING_L2);
-        default:
-            BUG();
-            return 0;
-    }
-    return 0;
-}
-
-#define __guest_get_l2e(v, va, gl2e) \
-    __guest_get_l2e(v, (unsigned long)va, gl2e, sizeof(*gl2e))
-
-static inline int  __guest_set_l1e(
-  struct vcpu *v, unsigned long va, void *value, int size)
-{
-    switch(size) {
-        case 4:
-            // 32-bit guest
-            {
-                l2_pgentry_32_t gl2e;
-                l1_pgentry_32_t *l1va;
-                unsigned long l1mfn;
-
-                if (!__guest_get_l2e(v, va, &gl2e))
-                    return 0;
-                if (unlikely(!(l2e_get_flags_32(gl2e) & _PAGE_PRESENT)))
-                    return 0;
-
-                l1mfn = get_mfn_from_gpfn(
-                  l2e_get_pfn(gl2e));
-
-                l1va = (l1_pgentry_32_t *)map_domain_page(l1mfn);
-                if (value)
-                    l1va[l1_table_offset_32(va)] = *(l1_pgentry_32_t *)value;
-                unmap_domain_page(l1va);
-
-                return 1;
-            }
-
-        case 8:
-            return __rw_entry(v, va, value, GUEST_ENTRY | SET_ENTRY | PAGING_L1);
-        default:
-            BUG();
-            return 0;
-    }
-    return 0;
-}
-
-#define __guest_set_l1e(v, va, value) \
-     __guest_set_l1e(v, (unsigned long)va, value, sizeof(*value))
-
-static inline int  __guest_get_l1e(
-  struct vcpu *v, unsigned long va, void *gl1e, int size)
-{
-    switch(size) {
-        case 4:
-            // 32-bit guest
-            {
-                l2_pgentry_32_t gl2e;
-                l1_pgentry_32_t *l1va;
-                unsigned long l1mfn;
-
-                if (!(__guest_get_l2e(v, va, &gl2e)))
-                    return 0;
-
-
-                if (unlikely(!(l2e_get_flags_32(gl2e) & _PAGE_PRESENT)))
-                    return 0;
-
-
-                l1mfn = get_mfn_from_gpfn(
-                  l2e_get_pfn(gl2e));
-                l1va = (l1_pgentry_32_t *) map_domain_page(l1mfn);
-                if (gl1e)
-                    *(l1_pgentry_32_t *)gl1e = l1va[l1_table_offset_32(va)];
-                unmap_domain_page(l1va);
-                return 1;
-            }
-        case 8:
-            // 64-bit guest
-            return __rw_entry(v, va, gl1e, GUEST_ENTRY | GET_ENTRY | PAGING_L1);
-        default:
-            BUG();
-            return 0;
-    }
-    return 0;
-}
-
-#define __guest_get_l1e(v, va, gl1e) \
-    __guest_get_l1e(v, (unsigned long)va, gl1e, sizeof(*gl1e))
-
-static inline void entry_general(
-  struct domain *d,
-  pgentry_64_t *gle_p,
-  pgentry_64_t *sle_p,
-  unsigned long smfn, u32 level)
-
-{
-    pgentry_64_t gle = *gle_p;
-    pgentry_64_t sle;
-
-    sle = entry_empty();
-    if ( (entry_get_flags(gle) & _PAGE_PRESENT) && (smfn != 0) )
-    {
-        if ((entry_get_flags(gle) & _PAGE_PSE) && level == PAGING_L2) {
-            sle = entry_from_pfn(smfn, entry_get_flags(gle));
-            entry_remove_flags(sle, _PAGE_PSE);
-
-            if ( shadow_mode_log_dirty(d) ||
-                 !(entry_get_flags(gle) & _PAGE_DIRTY) )
-            {
-                pgentry_64_t *l1_p;
-                int i;
-
-                l1_p =(pgentry_64_t *)map_domain_page(smfn);
-                for (i = 0; i < L1_PAGETABLE_ENTRIES; i++)
-                {
-                    if ( mfn_is_page_table(entry_get_pfn(l1_p[i])) )
-                        entry_remove_flags(l1_p[i], _PAGE_RW);
-                }
-
-                unmap_domain_page(l1_p);
-            }
-        } else {
-            if (d->arch.ops->guest_paging_levels <= PAGING_L3
-                    && level == PAGING_L3) {
-                sle = entry_from_pfn(smfn, entry_get_flags(gle));
-            } else {
-
-                sle = entry_from_pfn(
-                  smfn,
-                  (entry_get_flags(gle) | _PAGE_RW | _PAGE_ACCESSED) & ~_PAGE_AVAIL);
-                entry_add_flags(gle, _PAGE_ACCESSED);
-            }
-        }
-        // XXX mafetter: Hmm...
-        //     Shouldn't the dirty log be checked/updated here?
-        //     Actually, it needs to be done in this function's callers.
-        //
-        *gle_p = gle;
-    }
-
-    if ( entry_get_value(sle) || entry_get_value(gle) )
-        SH_VVLOG("%s: gpde=%lx, new spde=%lx", __func__,
-          entry_get_value(gle), entry_get_value(sle));
-
-    *sle_p = sle;
-}
-
-static inline void entry_propagate_from_guest(
-  struct domain *d, pgentry_64_t *gle_p, pgentry_64_t *sle_p, u32 level)
-{
-    pgentry_64_t gle = *gle_p;
-    unsigned long smfn = 0;
-
-    if ( entry_get_flags(gle) & _PAGE_PRESENT ) {
-        if ((entry_get_flags(gle) & _PAGE_PSE) && level == PAGING_L2) {
-            smfn =  __shadow_status(d, entry_get_pfn(gle), PGT_fl1_shadow);
-        } else {
-            smfn =  __shadow_status(d, entry_get_pfn(gle), 
-              shadow_level_to_type((level -1 )));
-        }
-    }
-    entry_general(d, gle_p, sle_p, smfn, level);
-
-}
-
-static int inline
-validate_entry_change(
-  struct domain *d,
-  pgentry_64_t *new_gle_p,
-  pgentry_64_t *shadow_le_p,
-  u32 level)
-{
-    pgentry_64_t old_sle, new_sle;
-    pgentry_64_t new_gle = *new_gle_p;
-
-    old_sle = *shadow_le_p;
-    entry_propagate_from_guest(d, &new_gle, &new_sle, level);
-
-    ESH_LOG("old_sle: %lx, new_gle: %lx, new_sle: %lx\n",
-      entry_get_value(old_sle), entry_get_value(new_gle),
-      entry_get_value(new_sle));
-
-    if ( ((entry_get_value(old_sle) | entry_get_value(new_sle)) & _PAGE_PRESENT) &&
-      entry_has_changed(old_sle, new_sle, _PAGE_PRESENT) )
-    {
-        perfc_incrc(validate_entry_changes);
-
-        if ( (entry_get_flags(new_sle) & _PAGE_PRESENT) &&
-          !get_shadow_ref(entry_get_pfn(new_sle)) )
-            BUG();
-        if ( entry_get_flags(old_sle) & _PAGE_PRESENT )
-            put_shadow_ref(entry_get_pfn(old_sle));
-    }
-
-    *shadow_le_p = new_sle;
-
-    return 1;
-}
-
-#endif
-
-
diff --git a/xen/include/asm-x86/shadow_ops.h b/xen/include/asm-x86/shadow_ops.h
deleted file mode 100644 (file)
index 8765ed8..0000000
+++ /dev/null
@@ -1,138 +0,0 @@
-/******************************************************************************
- * include/asm-x86/shadow_ops.h
- * 
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
- * 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-#ifndef _XEN_SHADOW_OPS_H
-#define _XEN_SHADOW_OPS_H
-
-#define PAGING_L4      4UL
-#define PAGING_L3      3UL
-#define PAGING_L2      2UL
-#define PAGING_L1      1UL
-
-#define PAE_CR3_ALIGN       5
-#define PAE_CR3_IDX_MASK    0x7f
-
-#if defined( GUEST_PGENTRY_32 )
-
-#define GUEST_L1_PAGETABLE_ENTRIES     L1_PAGETABLE_ENTRIES_32
-#define GUEST_L2_PAGETABLE_ENTRIES     L2_PAGETABLE_ENTRIES_32
-#define GUEST_ROOT_PAGETABLE_ENTRIES   ROOT_PAGETABLE_ENTRIES_32
-#define GUEST_L2_PAGETABLE_SHIFT       L2_PAGETABLE_SHIFT_32
-
-#define guest_l1_pgentry_t      l1_pgentry_32_t
-#define guest_l2_pgentry_t      l2_pgentry_32_t
-#define guest_root_pgentry_t    l2_pgentry_32_t
-
-#define guest_l1e_get_paddr     l1e_get_paddr_32
-#define guest_l2e_get_paddr     l2e_get_paddr_32
-
-#define guest_get_pte_flags     get_pte_flags_32
-#define guest_put_pte_flags     put_pte_flags_32
-
-#define guest_l1e_get_flags     l1e_get_flags_32
-#define guest_l2e_get_flags     l2e_get_flags_32
-#define guest_root_get_flags          l2e_get_flags_32
-#define guest_root_get_intpte         l2e_get_intpte
-
-#define guest_l1e_empty         l1e_empty_32
-#define guest_l2e_empty         l2e_empty_32
-
-#define guest_l1e_from_pfn      l1e_from_pfn_32
-#define guest_l2e_from_pfn      l2e_from_pfn_32
-
-#define guest_l1e_from_paddr    l1e_from_paddr_32
-#define guest_l2e_from_paddr    l2e_from_paddr_32
-
-#define guest_l1e_from_page     l1e_from_page_32
-#define guest_l2e_from_page     l2e_from_page_32
-
-#define guest_l1e_add_flags     l1e_add_flags_32
-#define guest_l2e_add_flags     l2e_add_flags_32
-
-#define guest_l1e_remove_flag   l1e_remove_flags_32
-#define guest_l2e_remove_flag   l2e_remove_flags_32
-
-#define guest_l1e_has_changed   l1e_has_changed_32
-#define guest_l2e_has_changed   l2e_has_changed_32
-#define root_entry_has_changed  l2e_has_changed_32
-
-#define guest_l1_table_offset   l1_table_offset_32
-#define guest_l2_table_offset   l2_table_offset_32
-
-#define guest_linear_l1_table   linear_pg_table_32
-#define guest_linear_l2_table   linear_l2_table_32
-
-#define guest_va_to_l1mfn       va_to_l1mfn_32
-
-#else
-
-#define GUEST_L1_PAGETABLE_ENTRIES      L1_PAGETABLE_ENTRIES
-#define GUEST_L2_PAGETABLE_ENTRIES      L2_PAGETABLE_ENTRIES
-#define GUEST_ROOT_PAGETABLE_ENTRIES    ROOT_PAGETABLE_ENTRIES
-#define GUEST_L2_PAGETABLE_SHIFT        L2_PAGETABLE_SHIFT
-
-#define guest_l1_pgentry_t      l1_pgentry_t
-#define guest_l2_pgentry_t      l2_pgentry_t
-#define guest_root_pgentry_t    l4_pgentry_t
-
-#define guest_l1e_get_paddr     l1e_get_paddr
-#define guest_l2e_get_paddr     l2e_get_paddr
-
-#define guest_get_pte_flags     get_pte_flags
-#define guest_put_pte_flags     put_pte_flags
-
-#define guest_l1e_get_flags     l1e_get_flags
-#define guest_l2e_get_flags     l2e_get_flags
-#define guest_root_get_flags    l4e_get_flags
-#define guest_root_get_intpte   l4e_get_intpte
-
-#define guest_l1e_empty         l1e_empty
-#define guest_l2e_empty         l2e_empty
-
-#define guest_l1e_from_pfn      l1e_from_pfn
-#define guest_l2e_from_pfn      l2e_from_pfn
-
-#define guest_l1e_from_paddr    l1e_from_paddr
-#define guest_l2e_from_paddr    l2e_from_paddr
-
-#define guest_l1e_from_page     l1e_from_page
-#define guest_l2e_from_page     l2e_from_page
-
-#define guest_l1e_add_flags     l1e_add_flags
-#define guest_l2e_add_flags     l2e_add_flags
-
-#define guest_l1e_remove_flag   l1e_remove_flags
-#define guest_l2e_remove_flag   l2e_remove_flags
-
-#define guest_l1e_has_changed   l1e_has_changed
-#define guest_l2e_has_changed   l2e_has_changed
-#define root_entry_has_changed  l4e_has_changed
-
-#define guest_l1_table_offset   l1_table_offset
-#define guest_l2_table_offset   l2_table_offset
-
-#define guest_linear_l1_table   linear_pg_table
-#define guest_linear_l2_table   linear_l2_table
-
-#define guest_va_to_l1mfn       va_to_l1mfn
-#endif
-
-#endif /* _XEN_SHADOW_OPS_H */
diff --git a/xen/include/asm-x86/shadow_public.h b/xen/include/asm-x86/shadow_public.h
deleted file mode 100644 (file)
index e2b4b5f..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-/******************************************************************************
- * include/asm-x86/shadow_public.h
- * 
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
- * 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-#ifndef _XEN_SHADOW_PUBLIC_H
-#define _XEN_SHADOW_PUBLIC_H
-
-#if CONFIG_PAGING_LEVELS >= 3
-#define MFN_PINNED(_x) (mfn_to_page(_x)->u.inuse.type_info & PGT_pinned)
-
-extern void shadow_sync_and_drop_references(
-      struct domain *d, struct page_info *page);
-extern void shadow_drop_references(
-      struct domain *d, struct page_info *page);
-
-extern int shadow_set_guest_paging_levels(struct domain *d, int levels);
-
-extern void release_out_of_sync_entry(
-    struct domain *d, struct out_of_sync_entry *entry);
-
-struct shadow_ops {
-    unsigned long guest_paging_levels; /* guest paging levels */
-    void (*invlpg)(struct vcpu *v, unsigned long va);
-    int  (*fault)(unsigned long va, struct cpu_user_regs *regs);
-    void (*update_pagetables)(struct vcpu *v);
-    void (*sync_all)(struct domain *d);
-    int  (*remove_all_write_access)(struct domain *d,
-             unsigned long readonly_gpfn, unsigned long readonly_gmfn);
-    int  (*do_update_va_mapping)(unsigned long va, l1_pgentry_t val, struct vcpu *v);
-    struct out_of_sync_entry *
-         (*mark_mfn_out_of_sync)(struct vcpu *v, unsigned long gpfn,
-                              unsigned long mfn);
-    int  (*is_out_of_sync)(struct vcpu *v, unsigned long va);
-    unsigned long (*gva_to_gpa)(unsigned long gva);
-};
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
-extern void shadow_l4_normal_pt_update(struct domain *d,
-                                       unsigned long pa, l4_pgentry_t l4e,
-                                       struct domain_mmap_cache *cache);
-#endif
-
-#endif
index 764b1c2c0501e1cbdeb3c1d0d68d682e1e08c61f..7f450c4624b6920c2a4563219b43efef57e626c3 100644 (file)
@@ -46,6 +46,7 @@ typedef l2_pgentry_t root_pgentry_t;
  *  12-bit flags = (pte[11:0])
  */
 
+#define _PAGE_NX_BIT            0U
 #define _PAGE_NX                0U
 
 /* Extract flags into 12-bit integer, or turn 12-bit flags into a pte mask. */
index 43e73033e35eefd6dfc8f36a08c2f557f3129f1f..e0187478cca0acca1ff6d63b7e7e0383dd6faa13 100644 (file)
@@ -59,7 +59,8 @@ typedef l3_pgentry_t root_pgentry_t;
  *  32-bit flags = (pte[63:44],pte[11:0])
  */
 
-#define _PAGE_NX (cpu_has_nx ? (1<<31) : 0)
+#define _PAGE_NX_BIT (1U<<31)
+#define _PAGE_NX     (cpu_has_nx ? _PAGE_NX_BIT : 0)
 
 /* Extract flags into 32-bit integer, or turn 32-bit flags into a pte mask. */
 #define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF))
index 0afb5e719bb9b24099c10115599d58ba778c964c..429cfb8c5d3af91599feea9c0e4df7d13e678835 100644 (file)
@@ -44,6 +44,8 @@ typedef l4_pgentry_t root_pgentry_t;
 /* Given a virtual address, get an entry offset into a linear page table. */
 #define l1_linear_offset(_a) (((_a) & VADDR_MASK) >> L1_PAGETABLE_SHIFT)
 #define l2_linear_offset(_a) (((_a) & VADDR_MASK) >> L2_PAGETABLE_SHIFT)
+#define l3_linear_offset(_a) (((_a) & VADDR_MASK) >> L3_PAGETABLE_SHIFT)
+#define l4_linear_offset(_a) (((_a) & VADDR_MASK) >> L4_PAGETABLE_SHIFT)
 
 #define is_guest_l1_slot(_s) (1)
 #define is_guest_l2_slot(_t, _s) (1)
@@ -70,7 +72,8 @@ typedef l4_pgentry_t root_pgentry_t;
 #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 40) | ((x) & 0xFFF))
 
 /* Bit 23 of a 24-bit flag mask. This corresponds to bit 63 of a pte.*/
-#define _PAGE_NX (cpu_has_nx ? (1U<<23) : 0U)
+#define _PAGE_NX_BIT (1U<<23)
+#define _PAGE_NX     (cpu_has_nx ? _PAGE_NX_BIT : 0U)
 
 #define L1_DISALLOW_MASK BASE_DISALLOW_MASK
 #define L2_DISALLOW_MASK BASE_DISALLOW_MASK
index d211ca1624cc5b0ceab9f48deafd33e6529efe79..f12cc931089286d0edb4490c9bc12f1315c63708 100644 (file)
@@ -262,6 +262,18 @@ DEFINE_XEN_GUEST_HANDLE(dom0_sched_id_t);
 #define DOM0_SHADOW_CONTROL_OP_CLEAN       11
 #define DOM0_SHADOW_CONTROL_OP_PEEK        12
 
+/* Shadow2 operations */
+#define DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION   30
+#define DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION   31
+#define DOM0_SHADOW2_CONTROL_OP_ENABLE           32
+
+/* Mode flags for Shadow2 enable op */
+#define DOM0_SHADOW2_CONTROL_FLAG_ENABLE    (1 << 0)
+#define DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT  (1 << 1)
+#define DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY (1 << 2)
+#define DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE (1 << 3)
+#define DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL  (1 << 4)
+
 struct dom0_shadow_control_stats {
     uint32_t fault_count;
     uint32_t dirty_count;
@@ -277,7 +289,9 @@ struct dom0_shadow_control {
     uint32_t       op;
     XEN_GUEST_HANDLE(ulong) dirty_bitmap;
     /* IN/OUT variables. */
-    uint64_t       pages;        /* size of buffer, updated with actual size */
+    uint64_t       pages;    /* size of buffer, updated with actual size */
+    uint32_t       mb;       /* Shadow2 memory allocation in MB */
+    uint32_t       mode;     /* Shadow2 mode to enable */
     /* OUT variables. */
     struct dom0_shadow_control_stats stats;
 };
index 03d7af5f0f6f2ad9ccab9d95d0ea2a76fdce6db2..2a51fcbacb6b54c2e9d4ad5a325a70e242e2b525 100644 (file)
@@ -26,6 +26,13 @@ extern void *map_domain_page(unsigned long pfn);
  */
 extern void unmap_domain_page(void *va);
 
+/* 
+ * Convert a VA (within a page previously mapped in the context of the
+ * currently-executing VCPU via a call to map_domain_pages()) to a machine 
+ * address 
+ */
+extern paddr_t mapped_domain_page_to_maddr(void *va);
+
 /*
  * Similar to the above calls, except the mapping is accessible in all
  * address spaces (not just within the VCPU that created the mapping). Global
@@ -98,6 +105,7 @@ domain_mmap_cache_destroy(struct domain_mmap_cache *cache)
 
 #define map_domain_page(pfn)                maddr_to_virt((pfn)<<PAGE_SHIFT)
 #define unmap_domain_page(va)               ((void)(va))
+#define mapped_domain_page_to_maddr(va)     (virt_to_maddr(va))
 
 #define map_domain_page_global(pfn)         maddr_to_virt((pfn)<<PAGE_SHIFT)
 #define unmap_domain_page_global(va)        ((void)(va))
@@ -112,4 +120,9 @@ struct domain_mmap_cache {
 
 #endif /* !CONFIG_DOMAIN_PAGE */
 
+#define HERE_I_AM \
+do { \
+    printk("HERE I AM: %s %s %d\n", __func__, __FILE__, __LINE__); \
+} while (0)
+
 #endif /* __XEN_DOMAIN_PAGE_H__ */
index e2c67a1d46a23d21c81aded4168afdc7c906d29a..e7d84afd927238aa6430d338fe9623c9a5ace7ae 100644 (file)
@@ -18,7 +18,7 @@ extern void __bug(char *file, int line) __attribute__((noreturn));
 #ifndef NDEBUG
 #define ASSERT(_p)                                                      \
     do {                                                                \
-        if ( !(_p) )                                                    \
+        if ( unlikely(!(_p)) )                                          \
         {                                                               \
             printk("Assertion '%s' failed, line %d, file %s\n", #_p ,   \
                    __LINE__, __FILE__);                                 \
@@ -41,7 +41,7 @@ struct domain;
 void cmdline_parse(char *cmdline);
 
 #ifndef NDEBUG
-extern int debugtrace_send_to_console;
+extern void debugtrace_toggle(void);
 extern void debugtrace_dump(void);
 extern void debugtrace_printk(const char *fmt, ...);
 #else
index 66cdfc814ba721e9c29417044ec40a3412e45f18..5072d0b924917d32a0eda4190275272cbfcd168f 100644 (file)
@@ -161,6 +161,16 @@ static __inline__ void list_splice(struct list_head *list, struct list_head *hea
        for (pos = (head)->next, n = pos->next; pos != (head); \
                pos = n, n = pos->next)
 
+/**
+ * list_for_each_backwards_safe        -       iterate backwards over a list safe against removal of list entry
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @n:         another &struct list_head to use as temporary storage
+ * @head:      the head for your list.
+ */
+#define list_for_each_backwards_safe(pos, n, head) \
+       for (pos = (head)->prev, n = pos->prev; pos != (head); \
+               pos = n, n = pos->prev)
+
 /**
  * list_for_each_entry -       iterate over list of given type
  * @pos:       the type * to use as a loop counter.
index c37e60f23ac23fd07b2e681cf037d86673a525a9..d90b27adc7f021b003835e40151d690af299e087 100644 (file)
@@ -376,9 +376,12 @@ extern struct domain *domain_list;
  /* VCPU is paused by the hypervisor? */
 #define _VCPUF_paused          11
 #define VCPUF_paused           (1UL<<_VCPUF_paused)
- /* VCPU is blocked awaiting an event to be consumed by Xen. */
+/* VCPU is blocked awaiting an event to be consumed by Xen. */
 #define _VCPUF_blocked_in_xen  12
 #define VCPUF_blocked_in_xen   (1UL<<_VCPUF_blocked_in_xen)
+ /* HVM vcpu thinks CR0.PG == 0 */
+#define _VCPUF_shadow2_translate 13
+#define VCPUF_shadow2_translate  (1UL<<_VCPUF_shadow2_translate)
 
 /*
  * Per-domain flags (domain_flags).